#' @export
HIVC.COUNTRY.TABLE<- data.table(matrix(c( "AG","ANTIGUA AND BARBUDA", "AL","ALBANIA", "AN", "ANTILLES", "AO","ANGOLA", "AR","ARGENTINA", "AT","AUSTRIA", "AU","AUSTRALIA",
"BB","BARBADOS", "BE","BELGIUM", "BG","BULGARIA", "BR","BRAZIL", "CA","CANADA", "CH","SWITZERLAND", "CI","COTE D'IVOIRE", "CL","CHILE",
"CN","CHINA", "CO","COLOMBIA", "CU","CUBA", "CW", "CURACAO", "CV","CAPE VERDE", "CY","CYPRUS", "CZ","CZECH REPUBLIC", "DE","GERMANY", "DK","DENMARK",
"DO","DOMINICAN REPUBLIC", "DZ","ALGERIA", "EC","ECUADOR", "ER", "ERITREA", "ES","SPAIN", "FI","FINLAND", "FR","FRANCE", "GA","GABON", "GB","UNITED KINGDOM",
"GD","GRENADA", "GE","GIORGIA", "GH","GHANA", "GR","GREECE", "GW","GUINEA-BISSAU", "GY","GUYANA", "HN","HONDURAS", "HT","HAITI",
"HU","HUNGARY", "ID", "INDONESIA", "IL","ISRAEL", "IN","INDIA", "IT","ITALY", "JM","JAMAICA", "JP","JAPAN", "KE","KENYA", "KR","SOUTH KOREA", "LB","LEBANON",
"LU","LUXEMBOURG", "LK", "SRILANKA", "LV","LATVIA", "MA","MOROCCO", "ME","MONTENEGRO", "MG","MADAGASCAR", "ML","MALI", "MN","MONGOLIA", "MX","MEXICO", "MY","MALAYSIA",
"NG","NIGERIA", "NL","NETHERLANDS", "NO","NORWAY", "PA","PANAMA", "PE","PERU", "PH","PHILIPPINES", "PL","POLAND", "PR","PUERTO RICO", "PT","PORTUGAL",
"PY","PARAGUAY", "RO","ROMANIA", "RS","SERBIA", "RU","RUSSIAN FEDERATION", "SC","SEYCHELLES", "SD","SUDAN", "SE","SWEDEN", "SG","SINGAPORE",
"SI","SLOVENIA", "SK","SLOVAKIA", "SN","SENEGAL", "SR","SURINAME", "SV","EL SALVADOR", "TH","THAILAND", "TT","TRINIDAD AND TOBAGO", "TW","TAIWAN",
"UA","UKRAINE", "UG","UGANDA", "US","UNITED STATES", "UY","URUGUAY", "VE","VENEZUELA", "VN","VIETNAM", "YE","YEMEN","ZA","SOUTH AFRICA"),ncol=2,byrow=1,dimnames=list(c(),c("key","country"))))
######################################################################################
project.hivc.clustering<- function(dir.name= DATA)
{
require(ape)
require(data.table)
require(RColorBrewer)
if(0) #plot composition of selected MSM clusters
{
hivc.prog.eval.clustering.bias()
}
if(0)
{
prog.recom.checkcandidates()
}
if(1)
{
project.hivc.clustering.compare.NoDR.to.NoRecombNoDR()
project.seq.dataset.mDR.mRC.mSH.pLANL()
}
if(0) #min brl to get a transmission cascade from brl matrix
{
brlmat <- matrix( c(0,0.1,0.1, 0.1,0,0.2, 0.1,0.2,0), ncol=3, nrow=3)
brlmat <- matrix( c(0,0.1,0.1,0.2, 0.1,0,0.2,0.3, 0.1,0.2,0,0.1, 0.2,0.3,0.1,0), ncol=4, nrow=4)
#brlmat <- matrix( c(0,0.1,0.5,0.5, 0.1,0,0.5,0.5, 0.5,0.5,0,0.1, 0.5,0.5,0.1,0), ncol=4, nrow=4)
#brlmat <- matrix( c(0,0.5,0.5, 0.5,0,0.1, 0.5,0.1,0), ncol=3, nrow=3)
#brlmat <- matrix( c(0,0.5,0.1, 0.5,0,0.5, 0.1,0.5,0), ncol=3, nrow=3)
#brlmat <- 0.5
#brlmat <- matrix(2, ncol=1, nrow=1)
#brlmat <- brlmat[upper.tri(brlmat)]
brl <- hivc.clu.min.transmission.cascade(brlmat)
str(brl)
stop()
}
if(0) #test clustering on simple test tree
{
ph <- "(Wulfeniopsis:0.196108,(((TN_alpinus:0.459325,TN_grandiflora:0.259364)1.00:0.313204,uniflora:1.155678)1.00:0.160549,(((TP_angustibracteata:0.054609,(TN_brevituba:0.085086,TP_stolonifera:0.086001)0.76:0.035958)1.00:0.231339,(((axillare:0.017540,liukiuense:0.018503)0.96:0.038019,stenostachyum:0.049803)1.00:0.083104,virginicum:0.073686)1.00:0.103843)1.00:0.086965,(carinthiaca:0.018150,orientalis:0.019697)1.00:0.194784)1.00:0.077110)1.00:0.199516,(((((abyssinica:0.077714,glandulosa:0.063758)1.00:0.152861,((((allionii:0.067154,(morrisonicola:0.033595,officinalis:0.067266)1.00:0.055175)1.00:0.090694,(alpina:0.051894,baumgartenii:0.024152,(bellidioides:0.016996,nutans:0.063292)0.68:0.031661,urticifolia:0.032044)0.96:0.036973,aphylla:0.117223)0.67:0.033757,(((japonensis:0.018053,miqueliana:0.033676)1.00:0.160576,vandellioides:0.099761)0.69:0.036188,montana:0.050690)1.00:0.058380)1.00:0.115874,scutellata:0.232093)0.99:0.055014)1.00:0.209754,((((((acinifolia:0.112279,reuterana:0.108698)0.94:0.055829,pusilla:0.110550)1.00:0.230282,((davisii:0.053261,serpyllifolia:0.087290)0.89:0.036820,(gentianoides:0.035798,schistosa:0.038522)0.95:0.039292)1.00:0.092830)1.00:0.169662,(((anagalloides:0.018007,scardica:0.017167)1.00:0.135357,peregrina:0.120179)1.00:0.098045,beccabunga:0.069515)1.00:0.103473)1.00:0.287909,(((((((((((agrestis:0.017079,filiformis:0.018923)0.94:0.041802,ceratocarpa:0.111521)1.00:0.072991,amoena:0.229452,(((argute_serrata:0.017952,campylopoda:0.075210)0.64:0.034411,capillipes:0.022412)0.59:0.034547,biloba:0.037143)1.00:0.141513,intercedens:0.339760,((opaca:0.019779,persica:0.035744)0.94:0.038558,polita:0.036762)1.00:0.108620,rubrifolia:0.186799)1.00:0.144789,(((bombycina_11:0.033926,bombycina_bol:0.035290,cuneifolia:0.017300,jacquinii:0.054249,oltensis:0.045755,paederotae:0.051579,turrilliana:0.017117)0.85:0.049052,czerniakowskiana:0.089983)0.93:0.051111,farinosa:0.138075)1.00:0.080565)1.00:0.104525,((albiflora:0.017984,ciliata_Anna:0.032685,vandewateri:0.017610)0.97:0.045649,arguta:0.063057,(catarractae:0.022789,decora:0.049785)0.96:0.048220,((cheesemanii:0.040125,cupressoides:0.146538)1.00:0.067761,macrantha:0.038130)1.00:0.088158,(densifolia:0.090044,formosa:0.116180)0.71:0.046353,(elliptica:0.038650,(odora:0.019325,salicornioides:0.021228)0.94:0.042950,salicifolia:0.020829)0.92:0.043978,(nivea:0.070429,(papuana:0.035003,tubata:0.031140)0.98:0.064379)0.93:0.065336,raoulii:0.109101)0.97:0.076607)0.93:0.085835,chamaepithyoides:0.485601)0.57:0.072713,(ciliata_157:0.069943,lanuginosa:0.052833)1.00:0.098638,(densiflora:0.069429,macrostemon:0.118926)0.92:0.124911,(fruticulosa:0.086891,saturejoides:0.041181)0.94:0.086148,kellererii:0.083762,lanosa:0.263033,mampodrensis:0.103384,nummularia:0.191180,pontica:0.128944,thessalica:0.129197)0.65:0.031006,(arvensis:0.342138,(((((chamaedrys:0.043720,micans:0.032021,vindobonensis:0.033309)0.51:0.034053,micrantha:0.019084)0.64:0.037906,krumovii:0.020175)1.00:0.103875,verna:0.254017)0.81:0.057105,magna:0.112657)1.00:0.104070)1.00:0.101845)1.00:0.149208,(((aznavourii:0.664103,glauca:0.405588)0.85:0.209945,praecox:0.447238)1.00:0.185614,(donii:0.260827,triphyllos:0.176032)1.00:0.194928)1.00:0.611079)0.74:0.055152,((crista:0.591702,(((cymbalaria_Avlan:0.017401,panormitana:0.017609)1.00:0.229508,((cymbalaria_Istanbul:0.028379,trichadena_332:0.016891,trichadena_Mugla:0.019131)1.00:0.196417,lycica_333:0.146772)1.00:0.097646,lycica_192:0.154877)1.00:0.234748,(((hederifolia:0.018068,triloba:0.075784)1.00:0.084865,(sibthorpioides:0.122542,sublobata:0.136951)1.00:0.074683)0.89:0.043623,stewartii:0.040679)1.00:0.596859)1.00:0.237324)0.58:0.057120,javanica:0.133802)1.00:0.137214)1.00:0.269201,(missurica:0.016685,rubra:0.019696)1.00:0.351184)0.54:0.058275)0.52:0.062485,((dahurica:0.023542,longifolia:0.016484,spicata:0.018125)0.95:0.042294,(nakaiana:0.016270,schmidtiana:0.058451)0.88:0.037207)1.00:0.261643)0.55:0.056458)1.00:0.229509,kurrooa:0.100611)0.74:0.068198,(bonarota:0.040842,lutea:0.115316)1.00:0.241657)0.99:0.085772);"
ph <- ladderize( read.tree(text = ph) )
#read bootstrap support values
thresh.bs <- 0.9
ph.node.bs <- as.numeric( ph$node.label )
ph.node.bs[is.na(ph.node.bs)] <- 0
ph.node.bs[c(13,15,27,41,43)] <- thresh.bs-0.05*seq(0.01,length.out=5)
ph.node.bs[ph.node.bs==1] <- seq_along(which(ph.node.bs==1))*0.005 + 0.7
ph$node.label <- ph.node.bs
#read patristic distances
stat.fun <- hivc.clu.min.transmission.cascade
#stat.fun <- max
dist.brl <- hivc.clu.brdist.stats(ph, eval.dist.btw="leaf", stat.fun=stat.fun)
print(dist.brl)
thresh.brl <- quantile(dist.brl,seq(0.1,1,by=0.05))["100%"]
print(quantile(dist.brl,seq(0.1,0.5,by=0.05)))
print(thresh.brl)
#produce clustering
clustering <- hivc.clu.clusterbythresh(ph, thresh.nodesupport=thresh.bs, thresh.brl=thresh.brl, dist.brl=dist.brl, nodesupport=ph.node.bs,retval="all")
print(clustering)
hivc.clu.plot(ph, clustering[["clu.mem"]], highlight.edge.of.tiplabel=c("TN_","TP_"), highlight.edge.of.tiplabel.col= c("red","blue") )
#produce some tip states
ph.tip.state<- rep(1:20, each=ceiling( length( ph$tip.label )/20 ))[seq_len(Ntip(ph))]
states <- data.table(state.text=1:20, state.col=rainbow(20))
states <- states[ph.tip.state,]
hivc.clu.plot.tiplabels( seq_len(Ntip(ph)), matrix(states[,state.text], nrow=1,ncol=nrow(states)), matrix(states[,state.col], nrow=1,ncol=nrow(states)), cex=0.4, adj=c(-1,0.5) )
#get nodes in each cluster
#clu.nodes <- sapply(unique(clustering[["clu.mem"]])[-1],function(clu){ which(clustering[["clu.mem"]]==clu) })
#get boostrap support of index cases
clu.idx <- clustering[["clu.idx"]]-Ntip(ph)
print(clu.idx)
print(ph.node.bs[clu.idx])
stop()
}
if(0)
{
#plot properties of concentrated clusters by region
verbose <- 1
thresh.bs <- 0.85
thresh.brl <- 0.06
infile <- paste("ATHENA_2013_03_CurAll+LANL_Sequences_examlbs100_withinpatientclusters_bs",thresh.bs*100,"_brlmed",thresh.brl*100,"_clusterinfo",sep='')
insignat <- "Sat_Jun_16_17/23/46_2013"
file <- paste(dir.name,"tmp",paste(infile,'_',gsub('/',':',insignat),".R",sep=''),sep='/')
if(verbose) cat(paste("read cluster info from file",file))
load(file)
if(verbose) cat(paste("found seq, n=",nrow(df.seqinfo)))
print(df.seqinfo)
df.seqinfo <- subset(df.seqinfo, !is.na(cluster) )
if(verbose) cat(paste("found seq in clu, n=",nrow(df.seqinfo)))
#remove within patient clusters before out-of-NL taken out
set(df.seqinfo, which( df.seqinfo[,is.na(Patient)] ), "Patient", "NA")
tmp <- df.seqinfo[, list(isWithinCluster= length(unique(Patient))==1), by=cluster]
df.seqinfo <- merge( subset(tmp,!isWithinCluster), df.seqinfo, by="cluster" )
#from between patient clusters, remove within patient seqs and foreign seqs
df.pat <- df.seqinfo[,list(CountryInfection=CountryInfection[1], Trm=Trm[1], Sex=Sex[1], NegT=NegT[1], AnyPos_T1=AnyPos_T1[1], RegionHospital=RegionHospital[1]), by=Patient]
df.clu <- df.seqinfo[, list(Patient=unique(Patient)), by=cluster]
df.clu <- merge(df.clu, df.pat, all.x=1, by="Patient")
#determine if cluster concentrated in region, and where
setkey(df.clu, cluster)
if(verbose) cat(paste("found pat in clu, n=",nrow(df.clu)))
clu.reg <- table( df.clu[,cluster,RegionHospital] )
clu.reg <- clu.reg / matrix(apply(clu.reg, 2, sum), nrow=nrow(clu.reg), ncol=ncol(clu.reg), byrow=1)
clu.regconc <- apply( clu.reg, 2, function(x) any(x>0.4) && length(which(x!=0))>2 ) |
apply( clu.reg, 2, function(x) any(x>0.5) && length(which(x!=0))<=2 )
if(verbose) cat(paste("number of clusters, n=",length(clu.regconc)))
if(verbose) cat(paste("number of spatially concentrated clusters, n=",length(which(clu.regconc))))
tmp <- rownames(clu.reg)
clu.regconc2<- sapply( seq_len(ncol(clu.reg)), function(j)
{
ifelse(!clu.regconc[j], NA, tmp[ which.max(clu.reg[,j]) ] )
})
df.clureg <- data.table(cluster= as.numeric(colnames( clu.reg )), IsRegionConcentrated= clu.regconc, RegionConcentrated=clu.regconc2, key="cluster" )
#determine median AnyPos_T1 time for cluster and add
tmp <- df.clu[,list(cluster.PosT=mean(AnyPos_T1, na.rm=T)),by=cluster]
df.clureg <- merge(df.clureg, tmp, all.x=1, by="cluster")
#merge all
df.seqinfo <- merge( df.seqinfo, df.clureg, all.x=1, by="cluster" )
#plot Bezemer style
file <- paste(dir.name,"tmp",paste(infile,"_spatialvariation_",gsub('/',':',insignat),".pdf",sep=''),sep='/')
df <- df.seqinfo
df[,time:=AnyPos_T1 ]
set(df, which(df[,!IsRegionConcentrated]), "RegionConcentrated", "Bridge")
tmp <- numeric(nrow(df))
tmp2 <- c("N","E","S","W","Amst","Bridge")
for( i in seq_along(tmp2))
tmp[which(df[,RegionConcentrated==tmp2[i]])] <- i
df[,sort1:=factor(tmp, levels=1:6, labels=tmp2) ]
df[,sort2:=cluster.PosT]
tmp <- numeric(nrow(df))
tmp2 <- c("N","E","S","W","Amst")
for( i in seq_along(tmp2))
tmp[which(df[,RegionHospital==tmp2[i]])] <- i
df[,covariate:=factor(tmp, levels=1:5, labels=tmp2) ]
xlab <- "first HIV+ event"
ylab <- paste("clusters BS=",thresh.bs*100," BRL.med=",thresh.brl*100," version=",gsub('/',':',insignat),sep="")
cex.points <- 0.5
#extract clusters one by one in desired order
tmp <- df[,list(sort1=sort1[1], sort2=sort2[1]),by=cluster]
clusters.sortby1 <- as.numeric( tmp[,sort1] )
clusters.sortby2 <- as.numeric( tmp[,sort2] )
clusters.sortby <- order(clusters.sortby1,clusters.sortby2, decreasing=F)
clusters <- tmp[clusters.sortby,cluster]
clusters.levels <- levels(df[,covariate])
clusters <- lapply(clusters,function(x) subset(df,cluster==x,c(time,covariate)) )
ylim <- c(-2,length(clusters))
cols <- brewer.pal(length(clusters.levels),"Set1") #[c(2,1,3,4,5)]
#cols <- c("green","red","blue","grey30")
cols <- sapply(cols, function(x) my.fade.col(x,0.7))
pch <- c(rep(19,length(clusters.levels)))
xlim <- range( df[,time], na.rm=1 )
xlim[1] <- xlim[1]-700
cat(paste("\nwrite plot to",file))
pdf(file,width=7,height=14)
plot(1,1,type='n',bty='n',xlim=xlim,ylim=ylim,ylab=ylab, xaxt='n',yaxt='n',xlab=xlab)
axis.Date(1, seq.Date(xlim[1],xlim[2],by="year"), labels = TRUE )
dummy<- sapply(seq_along(clusters),function(i)
{
cluster.cex <- cex.points #(1 - nrow(subset(clusters[[i]],hregion=="unknown")) / nrow(clusters[[i]])) * cex.points
cluster.ix <- order(as.numeric(clusters[[i]][,time]))
cluster.x <- as.numeric(clusters[[i]][,time])[cluster.ix]
cluster.y <- rep(i,nrow(clusters[[i]]))
cluster.z <- as.numeric(clusters[[i]][,covariate])[cluster.ix]
#print(clusters[[i]][,covariate])
#if(cluster.z[1]!=4)
#lines( c(cluster.x[1],xlim[2]), rep(i,2), col=cols[ cluster.z[1] ] )
points( cluster.x, cluster.y, col=cols[ cluster.z ], pch=pch[1], cex=cluster.cex )
})
pch[pch==19] <- 21
legend("topleft",bty='n',pt.bg=cols,pch=pch,legend=levels(df[,covariate]), col=rep("transparent",length(clusters.levels)))
dev.off()
stop()
}
if(0)
{
#check BEEHIVE sequences
indircov <- paste(DATA,"derived",sep='/')
infilecov <- "ATHENA_2013_03_AllSeqPatientCovariates"
load(paste(indircov,'/',infilecov,".R",sep=''))
df.bee <- as.data.table(read.csv("~/duke/2013_HIV_NL/ATHENA_2013/data/BEEHIVE1_data.csv", stringsAsFactors=0))
setnames(df.bee,"mcode","Patient")
df.bee <- merge(df.all, subset(df.bee,select=Patient), by="Patient")
save(df.bee, file="~/duke/2013_HIV_NL/ATHENA_2013/data/BEEHIVE1_data.R")
}
if(0)
{
verbose <- 1
resume <- 1
#
# precompute clustering stuff
#
patient.n <- 15700
indir <- paste(DATA,"tmp",sep='/')
infile <- "ATHENA_2013_03_CurAll+LANL_Sequences_examlbs100"
insignat <- "Sat_Jun_16_17/23/46_2013"
infile <- "ATHENA_2013_03_NoDRAll+LANL_Sequences_examlbs100"
insignat <- "Thu_Aug_01_17/05/23_2013"
indircov <- paste(DATA,"derived",sep='/')
infilecov <- "ATHENA_2013_03_AllSeqPatientCovariates"
argv <<- hivc.cmd.preclustering(indir, infile, insignat, indircov, infilecov, resume=resume)
argv <<- unlist(strsplit(argv,' '))
clu.pre <- hivc.prog.get.clustering.precompute()
#
# evaluate TPTN for various thresholds
#
if(verbose) cat(paste("compute TPTN for dist.brl.casc"))
argv <<- hivc.cmd.clustering.tptn(indir, infile, insignat, indircov, infilecov, opt.brl="dist.brl.casc", patient.n=patient.n, resume=resume)
argv <<- unlist(strsplit(argv,' '))
clu.tptn.casc <- hivc.prog.get.clustering.TPTN(clu.pre=clu.pre)
if(verbose) cat(paste("compute TPTN for dist.brl.max"))
argv <<- hivc.cmd.clustering.tptn(indir, infile, insignat, indircov, infilecov, opt.brl="dist.brl.max", patient.n=patient.n, resume=resume)
argv <<- unlist(strsplit(argv,' '))
clu.tptn.max <- hivc.prog.get.clustering.TPTN(clu.pre=clu.pre)
#
# compare dist.brl vs dist.max in terms of total patients in clusters
#
file <- paste(indir, paste(infile,"_clust_cascvsmax_patients_",gsub('/',':',outsignat),".pdf",sep=''),sep='/')
pdf(file=file,width=5,height=5)
par(mar=c(4,4,1,0))
dbwpat.cmp <- list( max =lapply(seq_along(select.bs), function(i) clu.tptn.max[["clusters.dbwpat"]][[select.brl[i]]][[select.bs[i]]] ),
casc=lapply(seq_along(select.bs), function(i) clu.tptn.casc[["clusters.dbwpat"]][[select.brl[i]]][[select.bs[i]]] ) )
ylim <- c(0,3100)
#sapply(dbwpat.cmp, function(x) sapply(x, function(z) sum(as.numeric(names(z))*z) ) )
xlim <- c(1,max( sapply(dbwpat.cmp, function(x) sapply(x, function(z) max(as.numeric(names(z)))) ) ))
plot(1,1,type='n',bty='n',xlim=xlim,ylim=ylim, xlab="cluster size", ylab="patients in clusters of size <=x")
lapply(seq_along(dbwpat.cmp), function(i)
{
lapply(seq_along(dbwpat.cmp[[i]]),function(j)
{
z <- dbwpat.cmp[[i]][[j]][-1]
z2 <- as.numeric(names(z))
lines(z2,cumsum(z*z2), col=cols[i], lty=j)
})
})
legend("topleft",bty='n', border=NA, fill= cols, legend= c("max","casc"))
legend("topright",bty='n', border=NA, lty= 1:2, legend= c("BS=0.8, BRL=0.1","BS=0.95, BRL=0.04"))
dev.off()
#
# compare dist.brl vs dist.max in terms of %cov
#
file <- paste(indir, paste(infile,"_clust_cascvsmax_covepi_",gsub('/',':',outsignat),".pdf",sep=''),sep='/')
pdf(file=file,width=5,height=5)
par(mar=c(4,4,1,0))
ylim <- c(0,0.2)
plot(1,1,type='n',bty='n',xlim=xlim,ylim=ylim, xlab="cluster size", ylab="%coverage (of epi) in clusters of size <=x")
lapply(seq_along(dbwpat.cmp), function(i)
{
lapply(seq_along(dbwpat.cmp[[i]]),function(j)
{
z <- dbwpat.cmp[[i]][[j]][-1]
z2 <- as.numeric(names(z))
lines(z2,cumsum(z*z2)/patient.n, col=cols[i], lty=j)
})
})
legend("topleft",bty='n', border=NA, fill= cols, legend= c("max","casc"))
legend("topright",bty='n', border=NA, lty= 1:2, legend= c("BS=0.8, BRL=0.1","BS=0.95, BRL=0.04"))
dev.off()
#
# compare dist.brl vs dist.max in terms of %cov vs %fp for BS=0.8/BRL=0.1/casc vs BS=0.95/BRL=0.05/max
#
covfp.cmp.x <- rbind( clu.tptn.casc[["fp.by.all"]]["0.8",], clu.tptn.max[["fp.by.all"]]["0.95",] )
covfp.cmp.y <- rbind( clu.tptn.casc[["clusters.cov.epidemic"]]["0.8",], clu.tptn.max[["clusters.cov.epidemic"]]["0.95",] )
file <- paste(indir, paste(infile,"_clust_cascvsmax_covepifp_",gsub('/',':',outsignat),".pdf",sep=''),sep='/')
pdf(file=file,width=5,height=5)
par(mar=c(4,4,1,0))
plot(1,1,type='n',bty='n',xlim=range(c(0.01,covfp.cmp.x)),ylim=range(c(0.2,covfp.cmp.y)),xlab="%FP (among all)",ylab="%coverage (of epi)")
dummy <- sapply(seq_len(nrow(covfp.cmp.x)),function(i)
{
points(covfp.cmp.x[i,],covfp.cmp.y[i,],col=cols[i],type='b')
text(covfp.cmp.x[i,],covfp.cmp.y[i,],col=cols[i],labels=as.numeric(colnames(covfp.cmp.y)),adj=c(-0.8,0.5),cex=0.5)
})
legend("bottomright",border=NA,bty='n',fill=cols,legend=c("max","casc"))
dev.off()
}
if(0)
{
#get branch lengths between F2F transmissions, where there must be a missed intermediary
outdir <- indir
outfile <- paste(infile,"_clust_",opt.brl,"_bs",thresh.bs*100,"_brl",thresh.brl*100,'_',"hetsamegender",sep='')
outsignat <- insignat
plot.file <- paste(outdir,'/',outfile,'_',gsub('/',':',outsignat),".pdf",sep='')
clu.female2female <- hivc.clu.getplot.female2female( clu.pre$ph, clu$clustering, df.cluinfo, plot.file=paste(outdir,'/',outfile,'_',gsub('/',':',outsignat),".pdf",sep='') )
clu.missedintermediary <- names(cluphy.brl.bwpat) %in% as.character( unique( clu.female2female$cluphy.df[,cluster] ) )
brl.missedintermediary <- unlist( lapply( which(clu.missedintermediary), function(i) cluphy.brl.bwpat[[i]] ) )
brl.others <- na.omit( unlist( lapply( which(!clu.missedintermediary), function(i) cluphy.brl.bwpat[[i]] ) ) )
brl.breaks <- seq(0,max(brl.others,brl.missedintermediary)*1.1,by=0.02)
brl.cols <- sapply( brewer.pal(3, "Set1"), function(x) my.fade.col(x,0.5) )
par(mfcol=c(1,2))
hist( brl.others, breaks=brl.breaks, col=brl.cols[1], border=NA, freq=T, main='', xlab="branch lengths, others" )
#legend("topright", fill= brl.cols[1:2], legend= c("others","missed intermediary"), bty='n', border=NA)
hist( brl.missedintermediary, breaks=brl.breaks, col=brl.cols[2], border=NA, freq=T, main='', xlab="branch lengths, missed intermediary" )
par(mfcol=c(1,1))
#highlight seq of same patient not in cluster
#select clusters with TN and P51
stop()
}
if(0) #count how many unlinked pairs in clustering
{
verbose <- 1
file <- paste(dir.name,"derived/ATHENA_2013_03_Unlinked_SeroConv_Dead_UnlinkedAll.R", sep='/')
if(verbose) cat(paste("read file",file))
load(file)
#str(unlinked.bytime)
infile <- "ATHENA_2013_03_FirstCurSequences_PROTRT_examlbs100"
signat.in <- "Fri_May_24_12/59/06_2013"
signat.out <- "Fri_Jun_07_09/59/23_2013"
file <- paste(dir.name,"tmp",paste(infile,'_',gsub('/',':',signat.in),".newick",sep=''),sep='/')
cat(paste("read file",file))
#
#set up tree, get boostrap values and patristic distances between leaves
#
ph <- ladderize( read.tree(file) )
ph.node.bs <- as.numeric( ph$node.label )
ph.node.bs[is.na(ph.node.bs)] <- 0
ph.node.bs <- ph.node.bs/100
ph$node.label <- ph.node.bs
#print(quantile(ph.node.bs,seq(0.1,1,by=0.1)))
dist.brl <- hivc.clu.brdist.stats(ph, eval.dist.btw="leaf") #read patristic distances -- this is the expensive step but still does not take very long
#print(quantile(dist.brl,seq(0.1,1,by=0.1)))
#
#convert truly unlinked pairs from Patient name to ph node index
#
df.tips <- data.table(Node=seq_len(Ntip(ph)), Patient=ph$tip.label )
setkey(df.tips, Patient) #use data.table to speed up search
df.tips <- df.all[df.tips]
ph.unlinked.dead <- lapply(seq_along(unlinked.bytime), function(j)
{
as.numeric( sapply(seq_along(unlinked.bytime[[j]]), function(i) subset(df.tips, unlinked.bytime[[j]][i]==Patient, Node) ))
})
ph.unlinked.seroneg <- as.numeric( sapply(names(unlinked.bytime), function(x) subset(df.tips, x==Patient, Node) ))
tmp <- sort( ph.unlinked.seroneg, index.return=1) #sort ph.unlinked.seroneg and return index, so we can also sort ph.unlinked.dead
ph.unlinked.seroneg <- tmp$x
names(ph.unlinked.seroneg) <- names(unlinked.bytime)[tmp$ix]
ph.unlinked.dead <- lapply(tmp$ix, function(i){ ph.unlinked.dead[[i]] })
names(ph.unlinked.dead) <- names(unlinked.bytime)[tmp$ix]
ph.unlinked.seroneg <- data.table(PhNode=ph.unlinked.seroneg, PhNodeUnlinked= seq_along(ph.unlinked.seroneg), Patient= names(ph.unlinked.seroneg))
setkey(ph.unlinked.seroneg, Patient) #set key to take right outer join with df.serocon -- temporarily destroy correspondance with 'ph.unlinked.dead'
setkey(df.serocon, Patient)
ph.unlinked.seroneg <- df.serocon[ph.unlinked.seroneg]
setkey(ph.unlinked.seroneg, PhNode) #use data.table to speed up search -- restore correspondance with 'ph.unlinked.dead'
#print(ph.unlinked.seroneg); print(ph.unlinked.dead); stop()
#
#determine threshold for patristic distances based on truly unlinked pairs
#
thresh.brl <- NULL
thresh.bs <- 0.9
ans <- hivc.clu.clusterbytypeIerror(ph, dist.brl, ph.node.bs, ph.unlinked.seroneg, ph.unlinked.dead, thresh.brl=thresh.brl, thresh.bs=thresh.bs, level= 0.001, verbose=1)
#
#determine quick estimate of expected false pos for these thresholds if clustering was random
#
check <- hivc.clu.exp.typeIerror.randomclu(ph, dist.brl, ph.node.bs, ph.unlinked.seroneg, ph.unlinked.dead, ans[["thresh.brl"]], ans[["thresh.bs"]])
cat(paste("\n#clusters with at least one FP=",check[["fp.n"]]," , %clusters with at least one FP=",check[["fp.rate"]],sep=''))
#
#max cluster size distribution
barplot(cumsum(h$counts), names.arg=seq_along(h$counts)+1, axisnames=1, xlab="maximum cluster size")
#
#number of sequences in cluster, and %
cat(paste("\n#seq in cluster=",sum(ans[["clu"]][["size.tips"]]),", %seq in cluster=",sum(ans[["clu"]][["size.tips"]])/Ntip(ph),sep=''))
#
#plot final clusters, save clusters
#
file <- paste(dir.name,"tmp",paste(infile,"_clu_",gsub('/',':',signat.out),".pdf",sep=''),sep='/')
cat(paste("plot clusters on tree to file",file))
hivc.clu.plot(ph, ans[["clu"]][["clu.mem"]], file=file, pdf.scaley=25)
file <- paste(dir.name,"tmp",paste(infile,"_clu_",gsub('/',':',signat.out),".R",sep=''),sep='/')
cat(paste("save analysis to file",file))
save(ans,check,ph,dist.brl,ph.node.bs,ph.unlinked.seroneg,ph.unlinked.dead, file=file)
}
}
project.ATHENA0313.process.recombinants<- function()
{
require(recombination.analyzer)
indir <- paste(DATA,"tmp",sep='/')
infile <- "ATHENA_2013_03_NoDRAll+LANL_Sequences"
#infile <- "ATHENA_2013_03_NoDRAll+LANL_Sequences100"
insignat <- "Thu_Aug_01_17/05/23_2013"
pipeline.recom.run.3seq(indir, infile, insignat, batch.n=100, hpc.walltime=35, hpc.q='pqeph', hpc.mem="3850mb", hpc.nproc=1)
pipeline.recom.get.phyloincongruence.for.candidates(indir, infile, insignat, resume=0, verbose=1, hpc.walltime=35, hpc.q=NA, hpc.mem="600mb", hpc.nproc=1)
pipeline.recom.plot.phyloincongruence.for.candidates(indir, infile, insignat, resume=0, verbose=1)
}
# collect likely recombinants or those likely confounding the phylogeny -- identified by eye
project.ATHENA0313.exclude.recombinants<- function()
{
verbose <- 1
argv <<- cmd.recombination.process.3SEQ.output(indir, infile, insignat, resume=1, verbose=1)
argv <<- unlist(strsplit(argv,' '))
df.recomb <- prog.recom.process.3SEQ.output()
setnames(df.recomb, "dummy", "triplet.id")
setkey(df.recomb, triplet.id)
# collect likely recombinants or those likely confounding the phylogeny
recombinants.ng2 <- c( subset( df.recomb, triplet.id==60 )[,child], #likely confounding
subset( df.recomb, triplet.id==52 )[,child],
subset( df.recomb, triplet.id==55 )[,parent2],"R09-26706","R11-12152","R12-15108", #others cluster in addition
subset( df.recomb, triplet.id==48 )[,parent2],"2007G319", #others cluster in addition
subset( df.recomb, triplet.id==112 )[,child],
subset( df.recomb, triplet.id==129 )[,child],
subset( df.recomb, triplet.id==148 )[,child], #length only 50
subset( df.recomb, triplet.id==135 )[,child],
subset( df.recomb, triplet.id==102 )[,child], #likely confounding
subset( df.recomb, triplet.id==85 )[,child],
subset( df.recomb, triplet.id==81 )[,child], #likely confounding
subset( df.recomb, triplet.id==125 )[,child], #likely confounding but might be recombinant
subset( df.recomb, triplet.id==120 )[,child] )
recombinants.g2 <- c( "2006G052", "2007G263", "M3621708072010", "M4048713072011",
"M4203226082011", "R03-14636", "TN_B.HT.2005.05HT_129336.EU439719", "TN_B.HT.2004.04HT_129732.EU439728", "TN_B.PH.2008.08R_01_361.AB587101", "TN_B.ZA.2011.patient_1720_seq_1746.KC423374", "TN_B.ZA.2012.patient_1720_seq_2734.KC423805", "TN_B.DO.2008.HIV_PRRT_PJ01967_48.JN713614", #these cluster with M4048713072011
"R11-15440", "R12-00343", "R10-09812", #these are children of R08-20970
"R12-07939" )
recombinants <- c( recombinants.ng2, recombinants.g2 )
if(verbose) cat(paste("\ncollected recombinants or likely confounding sequences, n=",length(recombinants)))
#
# save non-recombinant sequence dataset
#
indir <- paste(DATA,"tmp",sep='/')
infile <- "ATHENA_2013_03_NoDRAll+LANL_Sequences"
insignat <- "Thu_Aug_01_17/05/23_2013"
outfile <- "ATHENA_2013_03_NoRCDRAll+LANL_Sequences"
outsignat <- "Fri_Nov_01_16/07/23_2013"
file <- paste(indir,'/',infile,'_',gsub('/',':',insignat),".R",sep='')
load(file)
tmp <- setdiff(rownames(seq.PROT.RT), recombinants)
seq.PROT.RT <- seq.PROT.RT[tmp,]
if(verbose) cat(paste("\nnumber of sequences without (likely) recombinants, n=",nrow(seq.PROT.RT)))
file <- paste(indir,'/',outfile,'_',gsub('/',':',outsignat),".R",sep='')
if(verbose) cat(paste("\nsave new file to",file))
save(seq.PROT.RT, file=file)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.