##--------------------------------------------------------------------------------------------------------
## olli 23.10.15
##--------------------------------------------------------------------------------------------------------
#
# compute path differences on complete trees
#
treedist.pathdifference.wrapper<- function(df, ttrs, s, use.brl=FALSE, use.weight=FALSE)
{
#tmp <- subset(submitted.info, IDX==463)[1,]
#IDX<- 463
#IDX_T<-7
tmp <- df[, {
cat('\nAt IDX', IDX)
tidx <- ifelse(use.brl, SUB_IDX_T, TIME_IDX_T)
#IDX<- 241; TIME_IDX_T<- 6; IDX<- 1; TIME_IDX_T<- 1
stree <- s[[IDX]]
otree <- ttrs[[tidx]]
z <- setdiff(otree$tip.label, stree$tip.label)
stopifnot( length(z)==abs(diff(c(Ntip(otree), Ntip(stree)))) )
if(length(z))
otree <- drop.tip(otree, z)
z <- path.dist(otree, stree, use.weight=use.weight)
list(PD=z, NPD=z/choose(Ntip(otree),2), NPDSQ=z/sqrt(choose(Ntip(otree),2)), TAXA_NJ=Ntip(otree))
}, by='IDX']
tmp
}
#--------------------------------------------------------------------------------------------------------
# MSE between true distances and patristic distances (units time) in reconstructed tree
#--------------------------------------------------------------------------------------------------------
treedist.MSE.wrapper<- function(df, s, tbrl, tinfo, use.brl=TRUE)
{
ans <- df[, {
#IDX<- 724; TIME_IDX_T<- 13; SUB_IDX_T<- 2
#IDX<- 241; TIME_IDX_T<- 6; SUB_IDX_T<- 4
tmp2 <- tmp3 <- mse <- mae <- mse.tp <- mae.tp <- NULL
gc()
cat('\nLSD distances IDX at', IDX)
tidx <- ifelse(use.brl, SUB_IDX_T, TIME_IDX_T)
stree <- s[[IDX]]
# mean squared error and mean absolute error of all pairwise distances
tmp3 <- cophenetic.phylo(stree)
#tmp3 <- distTips(stree, seq_len(Ntip(stree)), method='patristic', useC=TRUE)
#tmp3 <- as.matrix(tmp3)
tmp3[upper.tri(tmp3, diag=TRUE)] <- NA_real_
tmp3 <- as.data.table(melt(tmp3))
setnames(tmp3, c('Var1','Var2','value'),c('TAXA1','TAXA2','PD_SIM'))
tmp3 <- subset(tmp3, !is.na(PD_SIM))
tmp2 <- subset(tbrl, IDX_T==tidx)
set(tmp2,NULL,'TAXA1',tmp2[, as.character(TAXA1)])
set(tmp2,NULL,'TAXA2',tmp2[, as.character(TAXA2)])
set(tmp3,NULL,'TAXA1',tmp3[, as.character(TAXA1)])
set(tmp3,NULL,'TAXA2',tmp3[, as.character(TAXA2)])
tmp2 <- merge(tmp3, tmp2, by=c('TAXA1','TAXA2'))
stopifnot(nrow(tmp2)==Ntip(stree)*(Ntip(stree)-1)/2)
mse <- tmp2[, mean((PD-PD_SIM)*(PD-PD_SIM))]
mae <- tmp2[, mean(abs(PD-PD_SIM))]
# mean squared error and mean absolute error of pairwise distances of sampled transmission pairs
set(tmp2,NULL,'TAXA1',tmp2[, as.integer(gsub('IDPOP_','',gsub('\\|.*','',as.character(TAXA1))))])
set(tmp2,NULL,'TAXA2',tmp2[, as.integer(gsub('IDPOP_','',gsub('\\|.*','',as.character(TAXA2))))])
setnames(tmp2,c('TAXA1','TAXA2'),c('IDPOP','IDTR'))
tmp3 <- subset(tinfo, IDX_T==tidx & IDTR_SAMPLED=='Y', select=c(IDPOP, IDTR))
tmp3 <- unique(tmp3, by=c('IDPOP','IDTR'))
tmp <- copy(tmp3)
setnames(tmp, c('IDPOP','IDTR'), c('IDTR','IDPOP'))
tmp3 <- rbind(tmp3, tmp, use.names=TRUE)
set(tmp3,NULL,'IDPOP',tmp3[, as.integer(gsub('IDPOP_','',IDPOP))])
set(tmp3,NULL,'IDTR',tmp3[, as.integer(gsub('IDPOP_','',IDTR))])
tmp2 <- merge(tmp2,tmp3,by=c('IDPOP','IDTR'))
mse.tp <- tmp2[, mean((PD-PD_SIM)*(PD-PD_SIM))]
mae.tp <- tmp2[, mean(abs(PD-PD_SIM))]
list(MSE=mse, MAE=mae, MSE_TP=mse.tp, MAE_TP=mae.tp, TAXA_NJ=Ntip(stree), EDGE_NJ=nrow(tmp2))
}, by='IDX']
ans
}
#--------------------------------------------------------------------------------------------------------
# MSE between true distances and patristic distances (units time) for each cluster in reconstructed tree
#--------------------------------------------------------------------------------------------------------
treedist.MSE.clusters.wrapper<- function(df, s, tbrl, tinfo, use.brl=TRUE)
{
#
setkey(tinfo, IDX_T)
ans <- df[, {
cat('\nAt IDX', IDX)
# IDX<- 724; TIME_IDX_T<- 13; SUB_IDX_T<- 2
tidx <- ifelse(use.brl, SUB_IDX_T, TIME_IDX_T)
stree <- s[[IDX]]
# get all clusters of this true tree (with IDX_T) that are of size>=3 (use "tinfo" for that)
z <- subset(tinfo, CLU_N>3 & IDX_T==tidx)
z <- unique(z, by='TAXA')
z <- merge(z, data.table(TAXA=stree$tip.label, IN_STREE=1), by='TAXA', all.x=1)
# calculate the size of these clusters in the simulated tree
z <- merge(z, z[, list(CLU_NS= length(which(IN_STREE==1))), by='IDCLU'], by='IDCLU')
# get all clusters of size >= 3 in both the simulated and true tree
z <- subset(z, CLU_NS>3)
# precompute what can be precomputed before next loop
tmp2 <- subset(tbrl, IDX_T==tidx)
set(tmp2,NULL,'TAXA1',tmp2[, as.character(TAXA1)])
set(tmp2,NULL,'TAXA2',tmp2[, as.character(TAXA2)])
set(tmp2,NULL,'IDPOP',tmp2[, as.integer(gsub('IDPOP_','',gsub('\\|.*','',as.character(TAXA1))))])
set(tmp2,NULL,'IDTR',tmp2[, as.integer(gsub('IDPOP_','',gsub('\\|.*','',as.character(TAXA2))))])
#
tmp4 <- unique(subset(tinfo, IDX_T==tidx & IDTR_SAMPLED=='Y', select=c(IDPOP, IDTR)), by=c('IDPOP','IDTR'))
tmp <- copy(tmp4)
setnames(tmp, c('IDPOP','IDTR'), c('IDTR','IDPOP'))
tmp4 <- rbind(tmp4, tmp, use.names=TRUE)
set(tmp4,NULL,'IDPOP',tmp4[, as.integer(gsub('IDPOP_','',IDPOP))])
set(tmp4,NULL,'IDTR',tmp4[, as.integer(gsub('IDPOP_','',IDTR))])
# if there any such clusters, calculate the quartet distance
if(nrow(z))
{
#IDCLU <- 3; TAXA <- subset(z, IDCLU==3)[, TAXA]
ans <- z[, {
sclu <- drop.tip(stree, setdiff(stree$tip.label,TAXA))
# mean squared error of all pairwise distances
tmp3 <- cophenetic.phylo(sclu)
tmp3[upper.tri(tmp3, diag=TRUE)] <- NA_real_
tmp3 <- as.data.table(melt(tmp3))
setnames(tmp3, c('Var1','Var2','value'),c('TAXA1','TAXA2','PD_SIM'))
tmp3 <- subset(tmp3, !is.na(PD_SIM))
set(tmp3,NULL,'TAXA1',tmp3[, as.character(TAXA1)])
set(tmp3,NULL,'TAXA2',tmp3[, as.character(TAXA2)])
# this merges to the intersection of taxa in sclu and the corresponding observed clu
tmp3 <- merge(tmp3, tmp2, by=c('TAXA1','TAXA2'))
mse <- tmp3[, mean((PD-PD_SIM)*(PD-PD_SIM))]
mae <- tmp3[, mean(abs(PD-PD_SIM))]
# mean squared error and mean absolute error of pairwise distances of sampled transmission pairs
tmp3 <- merge(tmp3,tmp4,by=c('IDPOP','IDTR'))
mse.tp <- tmp3[, mean((PD-PD_SIM)*(PD-PD_SIM))]
mae.tp <- tmp3[, mean(abs(PD-PD_SIM))]
list(MSE=mse, MAE=mae, MSE_TP=mse.tp, MAE_TP=mae.tp, TAXA_NC=Ntip(sclu), EDGE_NC=nrow(tmp2))
}, by='IDCLU']
}
if(!nrow(z))
ans <- data.table(MSE=NA_real_, MAE=NA_real_, MSE_TP=NA_real_, MAE_TP=NA_real_, TAXA_NC=NA_integer_, EDGE_NC=NA_integer_)
ans
}, by='IDX']
ans
}
#--------------------------------------------------------------------------------------------------------
#
#--------------------------------------------------------------------------------------------------------
treedist.pathdifference.clusters.wrapper<- function(df, ttrs, s, tinfo, use.brl=TRUE, use.weight=FALSE)
{
#
setkey(tinfo, IDX_T)
tmp <- df[, {
cat('\nAt IDX', IDX)
# IDX<- 1; SUB_IDX_T<- 1
stree <- s[[IDX]]
tidx <- ifelse(use.brl, SUB_IDX_T, TIME_IDX_T)
otree <- ttrs[[tidx]]
# get all clusters of this true tree (with IDX_T) that are of size>=3 (use "tinfo" for that)
z <- subset(tinfo, CLU_N>3 & IDX_T==tidx)
z <- unique(z, by='TAXA')
z <- merge(z, data.table(TAXA=stree$tip.label, IN_STREE=1), by='TAXA', all.x=1)
# calculate the size of these clusters in the simulated tree
z <- merge(z, z[, list(CLU_NS= length(which(IN_STREE==1))), by='IDCLU'], by='IDCLU')
# get all clusters of size >= 3 in both the simulated and true tree
z <- subset(z, CLU_NS>3)
# if there any such clusters, calculate the quartet distance
if(nrow(z))
{
#IDCLU <- 6; TAXA <- subset(z, IDCLU==6)[, TAXA]
ans <- z[, {
sclu <- drop.tip(stree, setdiff(stree$tip.label,TAXA))
oclu <- drop.tip(otree, union( setdiff(otree$tip.label, stree$tip.label), setdiff(otree$tip.label,TAXA)))
z <- path.dist(oclu, sclu, use.weight=use.weight)
list(PD=z, NPD=z/choose(Ntip(oclu),2), NPDSQ=z/sqrt(choose(Ntip(oclu),2)), TAXA_NC=Ntip(oclu))
}, by='IDCLU']
}
if(!nrow(z))
ans <- data.table(PD=NA_real_, NPD=NA_real_, NPDSQ=NA_real_, TAXA_NC=NA_integer_)
ans
}, by='IDX']
tmp
}
##--------------------------------------------------------------------------------------------------------
## olli 01.08.16 compute quartet differences on complete trees
##--------------------------------------------------------------------------------------------------------
treedist.quartetdifference.wrapper<- function(submitted.info, ttrs, strs_rtt)
{
tmp <- submitted.info[, {
cat('\nAt IDX', IDX)
#IDX<- 241; TIME_IDX_T<- 6
stree <- unroot(strs_rtt[[IDX]])
otree <- unroot(ttrs[[TIME_IDX_T]])
#print(stree)
#print(otree)
z <- setdiff(otree$tip.label, stree$tip.label)
stopifnot( length(z)==abs(diff(c(Ntip(otree), Ntip(stree)))) )
if(length(z))
otree <- unroot(drop.tip(otree, z))
z <- quartets.distance.cmd(otree, stree)
list(TAXA_NJ=z['TAXA_NJ'], NQD=z['NQD'])
}, by='IDX']
tmp
}
#--------------------------------------------------------------------------------------------------------
# olli 01.08.16 compute quartet differences on sub trees
#--------------------------------------------------------------------------------------------------------
treedist.quartetdifference.clusters.wrapper<- function(submitted.info, ttrs, strs_rtt, tinfo, pr.qdist='/apps/qdist/2.0/bin/qdist')
{
setkey(tinfo, IDX_T)
tmp <- subset(submitted.info, MODEL=='R')[, {
cat('\nAt IDX', IDX)
# IDX<- 1; SUB_IDX_T<- 1
stree <- strs_rtt[[IDX]]
otree <- ttrs[[SUB_IDX_T]]
# get all clusters of this true tree (with IDX_T) that are of size>=3 (use "tinfo" for that)
z <- subset(tinfo, CLU_N>3 & IDX_T==SUB_IDX_T)
z <- unique(z, by='TAXA')
z <- merge(z, data.table(TAXA=stree$tip.label, IN_STREE=1), by='TAXA', all.x=1)
# calculate the size of these clusters in the simulated tree
z <- merge(z, z[, list(CLU_NS= length(which(IN_STREE==1))), by='IDCLU'], by='IDCLU')
# get all clusters of size >= 3 in both the simulated and true tree
z <- subset(z, CLU_NS>3)
# if there any such clusters, calculate the quartet distance
if(nrow(z))
{
#IDCLU <- 6; TAXA <- subset(z, IDCLU==6)[, TAXA]
ans <- z[, {
sclu <- unroot(drop.tip(stree, setdiff(stree$tip.label,TAXA)))
oclu <- unroot(drop.tip(otree, union( setdiff(otree$tip.label, stree$tip.label), setdiff(otree$tip.label,TAXA))))
z <- quartets.distance.cmd(oclu, sclu, PROG.QDIST=pr.qdist)
list(NQDC=z['NQD'], TAXA_NC=Ntip(oclu))
}, by='IDCLU']
}
if(!nrow(z))
ans <- data.table(IDCLU=NA_integer_, NQDC=NA_real_, TAXA_NC=NA_integer_)
ans
}, by='IDX']
tmp
}
#--------------------------------------------------------------------------------------------------------
#
#--------------------------------------------------------------------------------------------------------
project.PANGEA.visualize.call.patterns.align150623<- function()
{
outdir <- '/Users/Oliver/Dropbox (SPH Imperial College)/2016_PANGEA_treecomp/figures'
min.coverage <- 600
min.depth <- 10
#with.gaps <- 1
#outfile <- '150623_PANGEAGlobal2681_C5_wgaps.pdf'
with.gaps <- 0
outfile <- '150623_PANGEAGlobal2681_C10.pdf'
infile <- '/Users/Oliver/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/PANGEA_150623/150623_PANGEAGlobal2681_C10.fa'
so <- read.dna(infile, format='fasta')
rownames(so) <- gsub('-','_',rownames(so))
# drop LTR from sequences
# need to load latest alignment, get LTR start, and translate into one common sequence
infile <- '~/Dropbox (SPH Imperial College)/2015_PANGEA_DualPairsFromFastQIVA/alignments_160110/PANGEA_HIV_n5003_Imperial_v160110_GlobalAlignment.rda'
load(infile) #loads sqi, sq
tmp <- sq[which(grepl('HXB2',rownames(sq))),]
pattern <- 'a-*t-*g-*g-*g-*t-*g-*c-*g-*a-*g-*a-*g-*c-*g-*t-*c-*a'
idx.st.in.sq <- regexpr(pattern, paste(as.character( tmp ),collapse=''))
tx.common <- intersect( rownames(so) , rownames(sq) )[1]
pattern <- paste(as.character( sq[tx.common, idx.st.in.sq:(idx.st.in.sq+10)]),collapse='')
idx.st.in.so <- regexpr(pattern, paste(as.character(so[tx.common,]), collapse=''))
so <- so[, seq.int(idx.st.in.so, ncol(so))]
# drop sites outside the HIV reference compendium
#tx.common <- tail( intersect( rownames(so) , rownames(sq) ), 1 )
tx.common <- 'PG14_ZA100095_S01002'
tmp <- gsub('-|?','', paste( as.character(sq[tx.common,]), collapse='') )
pattern <- substring(tmp, nchar(tmp)-20, nchar(tmp) )
pattern <- paste(strsplit(pattern, '')[[1]], collapse='-*')
tmp <- so[tx.common, ]
idx.end.in.so <- regexpr(pattern, paste(as.character( tmp ),collapse=''))
idx.end.in.so <- as.integer(idx.end.in.so+attr(idx.end.in.so,'match.length')-1L)
so <- so[, seq.int(1, idx.end.in.so)]
# drop gap only columns
if(!with.gaps)
{
tmp <- apply( as.character(so), 2, function(x) !all(x%in%c('?','-','n')) )
so <- so[, tmp]
}
# convert into chunks
ch <- lapply(seq_len(nrow(so)), function(i)
{
z <- gregexpr('1+', paste(as.numeric( !as.character( so[i,] )%in%c('-','?','n') ), collapse='') )[[1]]
data.table(PANGEA_ID= rownames(so)[i], POS=as.integer(z), DEPTH=min.depth, REP=attr(z,"match.length"))
})
ch <- do.call('rbind',ch)
set(ch, NULL, 'SITE', ch[, regmatches(PANGEA_ID, regexpr('_[A-Z]+',PANGEA_ID))])
set(ch, NULL, 'SITE', ch[, substring(SITE,2)])
ch <- merge(ch, ch[, list(COV=sum(REP)), by='PANGEA_ID'], by='PANGEA_ID')
# select min.coverage, select min.depth
ch <- subset(ch, !is.na(SITE) & COV>=min.coverage & DEPTH>=min.depth)
# define chunks
ch[, POS_NEXT:= POS+REP]
ch <- ch[, list(SITE=SITE, POS=POS, DEPTH=DEPTH, REP=REP, CHUNK=cumsum(as.numeric(c(TRUE, POS[-1]!=POS_NEXT[-length(POS_NEXT)])))), by='PANGEA_ID']
ch <- ch[, list(SITE=SITE[1], POS_CH=min(POS), REP_CH=sum(REP), DEPTH_CH= sum(DEPTH*REP)/sum(REP) ), by=c('PANGEA_ID','CHUNK')]
ch[, DEPTH_MIN:=min.depth]
set(ch, NULL, 'SITE', ch[, factor(SITE, levels=c('BW', 'ZA', 'UG'), labels=c('Botswana', 'South Africa', 'Uganda'))])
ch <- merge(ch, ch[, list(COV=sum(REP_CH)), by='PANGEA_ID'], by='PANGEA_ID')
ch[, COVP:= COV/ncol(so)]
#
setkey(ch, PANGEA_ID)
dcast.data.table(unique(ch, by='PANGEA_ID')[, list(P=seq(0,1,0.1), Q=quantile(COV, p=seq(0,1,0.1))), by='SITE'], P~SITE, value.var='Q')
dcast.data.table(unique(ch, by='PANGEA_ID')[, list(Q=seq(1e3,8e3,1e3), P=ecdf(COV)(seq(1e3,8e3,1e3))), by='SITE'], Q~SITE, value.var='P')
# proportion of non-gaps in alignment
#unique(ch)[, sum(COV)] / (nrow(unique(ch))*ncol(so))
nrow(unique(ch))
# C5: 2367
# C10: 2126
unique(ch)[, mean(COVP)]
# C5: 0.6201531
# C10: 0.6510485
unique(ch)[, mean(COV)]
# C5: 5472.231
# C10: 5744.852
# plot chunks
require(viridis)
ch <- merge(ch, ch[, list(POS_CHF=min(POS_CH)), by='PANGEA_ID'], by='PANGEA_ID')
setkey(ch, SITE, PANGEA_ID)
tmp <- unique(ch, by=c('SITE','PANGEA_ID'))
setkey(tmp, POS_CHF)
if(min.depth==5)
tmp[, PLOT:=ceiling(seq_len(nrow(tmp))/600)]
if(min.depth==10)
tmp[, PLOT:=ceiling(seq_len(nrow(tmp))/535)]
ch <- merge(ch, subset(tmp, select=c(PANGEA_ID, PLOT)), by='PANGEA_ID')
set(ch, NULL, 'PLOT', ch[, factor(PLOT, levels=c(4,3,2,1), labels=c(4,3,2,1))])
setkey(ch, POS_CH, SITE)
set(ch, NULL, 'PANGEA_ID', ch[, factor(PANGEA_ID, levels=unique(PANGEA_ID), labels=unique(PANGEA_ID))])
ggplot(ch, aes(y=PANGEA_ID, yend=PANGEA_ID, x=POS_CH, xend=POS_CH+REP_CH-1L, colour=SITE)) +
scale_x_continuous(expand=c(0,0), breaks=seq(0,10e3,1e3), minor_breaks=seq(0,10e3,100)) +
scale_colour_manual(values=c('Botswana'="#1B0C42FF", 'South Africa'="#CF4446FF", 'Uganda'="#781C6DFF")) +
geom_segment() + theme_bw() +
facet_wrap(~PLOT, scales='free_y', ncol=4) +
labs(x='\nalignment position', y='PANGEA-HIV sequences\n', colour='sampling\nlocation') +
theme( axis.text.y=element_blank(), axis.ticks.y=element_blank(), axis.line.y=element_blank(), legend.position='bottom',
strip.text= element_blank(), strip.background=element_blank()) +
guides(colour=guide_legend(override.aes=list(size=5)))
ggsave(file=file.path(outdir,outfile), w=15, h=10, limitsize = FALSE)
}
#--------------------------------------------------------------------------------------------------------
#
#--------------------------------------------------------------------------------------------------------
project.PANGEA.visualize.call.patterns.align160110<- function()
{
min.coverage <- 600
min.depth <- 10
infile <- '/Users/Oliver/Dropbox (SPH Imperial College)/2015_PANGEA_DualPairsFromFastQIVA/readlengths/bam_stats_150218.rda'
load(infile)
setnames(bam.cov, c('FILE_ID','COV'), c('SANGER_ID','DEPTH'))
wdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/explaingaps'
wfile <- 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment.rda'
load(file.path(wdir,wfile)) #loads sqi, sq, dm
outdir <- '~/Dropbox (SPH Imperial College)/2016_PANGEA_treecomp/figures'
#
# merge MRC Uganda
#
set(dm, dm[, which(grepl('MRC',COHORT))], 'COHORT', 'UG-MRC')
#
# convert sqp into chunks
#
ch <- lapply(seq_len(nrow(sqp)), function(i)
{
z <- gregexpr('1+', paste(as.numeric( !as.character( sqp[i,] )%in%c('-','?','n') ), collapse='') )[[1]]
data.table(PANGEA_ID= rownames(sqp)[i], POS=as.integer(z), DEPTH=min.depth, REP=attr(z,"match.length"))
})
ch <- do.call('rbind',ch)
# define SITE
tmp <- subset(dm, select=c(TAXA, COHORT))
setnames(tmp, c('TAXA','COHORT'),c('PANGEA_ID','SITE'))
ch <- merge(ch, tmp, by='PANGEA_ID')
# get coverage
ch <- merge(ch, ch[, list(COV=sum(REP)), by='PANGEA_ID'], by='PANGEA_ID')
# select min.coverage, select min.depth
ch <- subset(ch, COV>=min.coverage & DEPTH>=min.depth)
# define chunks
ch[, POS_NEXT:= POS+REP]
ch <- ch[, list(SITE=SITE, POS=POS, DEPTH=DEPTH, REP=REP, CHUNK=cumsum(as.numeric(c(TRUE, POS[-1]!=POS_NEXT[-length(POS_NEXT)])))), by='PANGEA_ID']
ch <- ch[, list(SITE=SITE[1], POS_CH=min(POS), REP_CH=sum(REP), DEPTH_CH= sum(DEPTH*REP)/sum(REP) ), by=c('PANGEA_ID','CHUNK')]
ch[, DEPTH_MIN:=min.depth]
set(ch, NULL, 'SITE', ch[, factor(SITE, levels=c("AC_Resistance","BW-Mochudi","UG-MRC","RCCS"), labels=c('Africa Centre\n(Resistance Cohort)', 'Botswana\n(Mochudi)', 'Uganda-\nMRC','Rakai Community\nCohort Study'))])
ch <- merge(ch, ch[, list(COV=sum(REP_CH)), by='PANGEA_ID'], by='PANGEA_ID')
ch[, COVP:= COV/ncol(sq)]
ch <- merge(ch, ch[, list(POS_CHF=min(POS_CH)), by='PANGEA_ID'], by='PANGEA_ID')
#
# plot chunks
#
require(viridis)
setkey(ch, SITE, PANGEA_ID)
tmp <- unique(ch, by=c('SITE','PANGEA_ID'))
setkey(tmp, POS_CHF)
tmp[, PLOT:=ceiling(seq_len(nrow(tmp))/ceiling(nrow(tmp)/4))]
ch <- merge(ch, subset(tmp, select=c(PANGEA_ID, PLOT)), by='PANGEA_ID')
set(ch, NULL, 'PLOT', ch[, factor(PLOT, levels=c(4,3,2,1), labels=c(4,3,2,1))])
setkey(ch, POS_CH, SITE)
set(ch, NULL, 'PANGEA_ID', ch[, factor(PANGEA_ID, levels=unique(PANGEA_ID), labels=unique(PANGEA_ID))])
dpani <- subset(dpan, !is.na(START))[, list(START=START[1], END=START[1]+max(IDX)-1L), by='PR']
dgeni <- subset(dgene, GENE%in%c('GAG','POL','ENV'))
ggplot(ch) +
scale_x_continuous(expand=c(0,0), breaks=seq(0,10e3,1e3), minor_breaks=seq(0,10e3,100)) +
scale_colour_manual(values=c('Botswana\n(Mochudi)'="#33638DFF", 'Africa Centre\n(Resistance Cohort)'="#CF4446FF", 'Rakai Community\nCohort Study'="#A8327DFF", 'Uganda-\nMRC'="#29AF7FFF")) +
geom_segment(aes(y=PANGEA_ID, yend=PANGEA_ID, x=POS_CH, xend=POS_CH+REP_CH-1L, colour=SITE)) +
geom_rect(data=dpani, aes(xmin=START, xmax=END, ymin=-Inf, ymax=Inf), fill="black", size=5) +
#geom_text(data=dpani, aes(x=START, y=seq_len(length(START))*10+10, label=PR), colour="black", hjust=-.2, size=2) +
geom_vline(data=dgeni, aes(xintercept=START), col='blue') +
geom_vline(data=dgeni, aes(xintercept=END), col='blue') +
#geom_text(data=dgeni, aes(x=START, y=seq_len(length(START))*10+10, label=GENE), colour="blue", hjust=-.2, size=2) +
#geom_text(data=dgeni, aes(x=END, y=seq_len(length(END))*10+10, label=GENE), colour="blue", hjust=-.2, size=2) +
theme_bw() +
facet_wrap(~PLOT, scales='free_y', ncol=4) +
labs(x='\nalignment position', y='PANGEA-HIV sequences\n', colour='sampling\nlocation') +
theme( axis.text.y=element_blank(), axis.ticks.y=element_blank(), axis.line.y=element_blank(), legend.position='bottom',
strip.text= element_blank(), strip.background=element_blank()) +
guides(colour=guide_legend(override.aes=list(size=5)))
ggsave(file=file.path(outdir,'PANGEA_HIV_n5003_Imperial_v160110_selectedchunks_160905.pdf'), w=15, h=15, limitsize=FALSE)
#
# for comparison, ATHENA data set
#
infile <- "/Users/Oliver/duke/2013_HIV_NL/ATHENA_2013/data/tmp/ATHENA_2013_03_NoDRAll+LANL_Sequences_Thu_Aug_01_17:05:23_2013.fasta"
sq <- read.dna(infile, format='fa')
sq <- sq[1:1293,]
sqi <- data.table(FASTA= rownames(sq))
sqi[, FRGN:= grepl('^TN', FASTA) | grepl('PROT+P51', FASTA)]
sq <- sq[ subset(sqi, !FRGN)[, FASTA], ]
# remove all gap cols
sq <- as.character(sq)
tmp <- apply(sq, 2, function(x) !all(x%in%c('-','n','?')))
sq <- sq[,tmp]
# count coverage
sqi <- subset(sqi, !FRGN)[, list(COV=length(which( !sq[FASTA,]%in%c('-','n','?') ))), by='FASTA']
sqi[, COVP:= COV/ncol(sq)]
sqi[, mean(COVP)]
# 0.96
}
##--------------------------------------------------------------------------------------------------------
## olli 01.11.15
## from https://www.cs.upc.edu/~valiente/comput-biol/
##--------------------------------------------------------------------------------------------------------
quartet <- function (t,i,j,k,l)
{
unroot(drop.tip(t,setdiff(t$tip.label,c(i,j,k,l))))
}
quartets.distance <- function (t1,t2) {
L <- sort(t1$tip.label)
d <- 0
for (i in L[1:(length(L)-3)]) {
for (j in L[(match(i,L)+1):(length(L)-2)]) {
for (k in L[(match(j,L)+1):(length(L)-1)]) {
for (l in L[(match(k,L)+1):length(L)]) {
q1 <- quartet(t1,i,j,k,l)
q2 <- quartet(t2,i,j,k,l)
if (dist.topo(q1,q2) != 0) { d <- d + 2 }
}
}
}
}
d
}
quartets.distance.cmd <- function(otree, stree, PROG.QDIST='/apps/qdist/2.0/bin/qdist')
{
file1 <- paste( getwd(), '/tree1', sep='')
file2 <- paste( getwd(), '/tree2', sep='')
write.tree(otree,file=file1)
write.tree(stree,file=file2)
cmd <- paste(PROG.QDIST,file1,file2)
tmp <- system(cmd, intern=TRUE)
c('TAXA_NJ'=Ntip(otree), 'NQD'=as.numeric(tail(unlist(strsplit(tmp[2],'\t')),1)))
}
treedist.get.tree.100bs<- function(phr, phs, tmpdir)
{
require(ggtree)
require(phangorn)
# write to tmp directory
write.tree(phr, file=file.path(tmpdir,'phr.newick'))
invisible(sapply(seq_along(phs), function(i){ write.tree(phs[[i]], file=file.path(tmpdir, paste('phs',i,'.newick', sep=''))) }))
cmd <- paste('for i in $(seq 1 ',length(phs),'); do cat ',file.path(tmpdir,'phs$i.newick'),' >> ',file.path(tmpdir,'phc.newick'),'; done', sep='')
system(cmd)
cmd <- paste('cd ',tmpdir,'\nraxmlHPC-AVX -f b -t phr.newick -z phc.newick -m GTRCAT -s alg -n TEST', sep='')
system(cmd)
# prepare RAXML tree for plotting
raxml <- read.raxml(file.path(tmpdir,'RAxML_bipartitionsBranchLabels.TEST'))
pho <- raxml@phylo
tmp <- as.data.table(raxml@bootstrap)
z <- subset(tmp, bootstrap==100)[, {
#node <- 1597
desc <- Descendants(pho, node, type='all')
if( any(desc%in%subset(tmp, bootstrap<100)[, node]) )
desc<- NA_integer_
list(desc=desc)
}, by='node']
z <- subset(z, !is.na(desc))
z[, bootstrap:=100]
z[, node:=NULL]
setnames(z, 'desc', 'node')
z <- rbind(z, data.table(node= setdiff(seq.int(Nnode(pho, internal.only=FALSE)), z[, node]), bootstrap=0))
raxml@bootstrap <- as.data.frame(z)
# delete stuff
invisible(file.remove(list.files(tmpdir, full.names=TRUE)))
raxml
}
treedist.quartets.add<- function(submitted.info=NULL, ttrs=NULL, strs=NULL, file=NULL, with.save=0)
{
# file<- '/work/or105/Gates_2014/tree_comparison/submitted_151101.rda'
require(ape)
require(data.table)
stopifnot( !is.null(submitted.info) || !is.null(file))
if(is.null(submitted.info))
{
load(file)
with.save <- 1
}
#tmp <- subset(submitted.info, IDX==463)[1,]
#IDX<- 463
#IDX_T<-7
tmp <- submitted.info[, {
cat('\nAt IDX', IDX)
#IDX<- 241; TIME_IDX_T<- 6
stree <- unroot(strs_rtt[[IDX]])
otree <- unroot(ttrs[[TIME_IDX_T]])
#print(stree)
#print(otree)
z <- setdiff(otree$tip.label, stree$tip.label)
stopifnot( length(z)==abs(diff(c(Ntip(otree), Ntip(stree)))) )
if(length(z))
otree <- unroot(drop.tip(otree, z))
z <- quartets.distance.cmd(otree, stree)
list(TAXA_NJ=z['TAXA_NJ'], NQD=z['NQD'])
}, by='IDX']
submitted.info <- merge(submitted.info, tmp, by='IDX')
#
setkey(tinfo, IDX_T)
tmp <- subset(submitted.info, MODEL=='R')[, {
cat('\nAt IDX', IDX)
# IDX<- 1; SUB_IDX_T<- 1
stree <- strs_rtt[[IDX]]
otree <- ttrs[[SUB_IDX_T]]
# get all clusters of this true tree (with IDX_T) that are of size>=3 (use "tinfo" for that)
z <- subset(tinfo, CLU_N>3 & IDX_T==SUB_IDX_T)
setkey(z, TAXA)
z <- unique(z, by='TAXA')
z <- merge(z, data.table(TAXA=stree$tip.label, IN_STREE=1), by='TAXA', all.x=1)
# calculate the size of these clusters in the simulated tree
z <- merge(z, z[, list(CLU_NS= length(which(IN_STREE==1))), by='IDCLU'], by='IDCLU')
# get all clusters of size >= 3 in both the simulated and true tree
z <- subset(z, CLU_NS>3)
# if there any such clusters, calculate the quartet distance
if(nrow(z))
{
#IDCLU <- 6; TAXA <- subset(z, IDCLU==6)[, TAXA]
ans <- z[, {
sclu <- unroot(drop.tip(stree, setdiff(stree$tip.label,TAXA)))
oclu <- unroot(drop.tip(otree, union( setdiff(otree$tip.label, stree$tip.label), setdiff(otree$tip.label,TAXA))))
z <- quartets.distance.cmd(oclu, sclu)
list(NQDC=z['NQD'])
}, by='IDCLU']
}
if(!nrow(z))
ans <- data.table(IDCLU=NA_integer_, NQDC=NA_real_)
ans
}, by='IDX']
sclu.info <- merge(sclu.info, tmp, by=c('IDX','IDCLU'))
if(with.save)
save(strs, strs_rtt, ttrs, tinfo, tfiles, submitted.info, sclu.info, lba, file=gsub('\\.rda','_QD\\.rda',file))
}
##--------------------------------------------------------------------------------------------------------
## olli 13.07.16
##--------------------------------------------------------------------------------------------------------
treedist.closest.ind.obs<- function(tinfo, gd.thresh=Inf, rtn.pairs=FALSE)
{
tmp <- subset(tinfo, BRL_T=='subst')[, {
#z<- subset(tinfo, BRL_T=='subst' & IDX_T==1); IDPOP<- z$IDPOP; IDTR<- z$IDTR; IDREC<- z$IDREC; IDPOP_CL<- z$IDPOP_CL; IDPOP_CL_GD<- z$IDPOP_CL_GD
gds <- data.table(IDPOP=as.integer(gsub('IDPOP_','',IDPOP)), IDTR= as.integer(IDTR), IDREC=IDREC, IDPOP_CL=as.integer(IDPOP_CL), IDPOP_CL_GD=IDPOP_CL_GD)
gds[, CL_PH_PAIR:= gds[, as.character(as.numeric(IDPOP<IDPOP_CL))]]
tmp <- gds[, which(CL_PH_PAIR=='1')]
set(gds, tmp, 'CL_PH_PAIR', gds[tmp, paste(IDPOP,IDPOP_CL,sep=',')])
tmp <- gds[, which(CL_PH_PAIR=='0')]
set(gds, tmp, 'CL_PH_PAIR', gds[tmp, paste(IDPOP_CL,IDPOP,sep=',')])
setkey(gds, CL_PH_PAIR)
ans <- subset(gds, IDPOP_CL_GD<=gd.thresh)
# define tr->POP pairs
ans[, TR_PAIR:= ans[, as.character(as.numeric(IDPOP<IDTR))]]
tmp <- ans[, which(TR_PAIR=='1')]
set(ans, tmp, 'TR_PAIR', ans[tmp, paste(IDPOP,IDTR,sep=',')])
tmp <- ans[, which(TR_PAIR=='0')]
set(ans, tmp, 'TR_PAIR', ans[tmp, paste(IDTR,IDPOP,sep=',')])
# define POP->rec pairs
ans[, REC_PAIR:= ans[, as.character(as.numeric(IDPOP<IDREC))]]
tmp <- ans[, which(REC_PAIR=='1')]
set(ans, tmp, 'REC_PAIR', ans[tmp, paste(IDPOP,IDREC,sep=',')])
tmp <- ans[, which(REC_PAIR=='0')]
set(ans, tmp, 'REC_PAIR', ans[tmp, paste(IDREC,IDPOP,sep=',')])
# determine matches
ans[, IN:= CL_PH_PAIR==TR_PAIR]
tmp <- ans[, which(!is.na(IDREC) & !IN)]
set(ans, tmp, 'IN', ans[tmp, CL_PH_PAIR==REC_PAIR])
ans <- ans[, list(TRUE_PAIR=any(IN)), by='CL_PH_PAIR']
if(!rtn.pairs)
{
# calculate proportion of phylog closest pairs that are true transmission pairs
#ans <- list(TR_REC_perc_T= ans[, mean(as.numeric(TRUE_PAIR))] )
ans <- list(TPAIR_PHCL_T= ans[, length(which(TRUE_PAIR))], NTPAIR_PHCL_T= ans[, length(which(!TRUE_PAIR))])
}
ans
}, by='IDX_T']
setnames(tmp, 'IDX_T','SUB_IDX_T')
tmp
}
##--------------------------------------------------------------------------------------------------------
## olli 13.07.16
##--------------------------------------------------------------------------------------------------------
treedist.closest.ind.reconstructed<- function(submitted.info, tinfo, strs, gd.thresh)
{
sucl <- subset(submitted.info, MODEL=='R')[, {
print(IDX)
#IDX<- 557; SUB_IDX_T<-2; SC<- '150701_REGIONAL_TRAIN2'
ph <- strs[[IDX]]
model.reg <- grepl('REGIONAL',SC)
gds <- treedist.closest.ind(ph, model.reg)
gds <- subset(gds, IDPOP_CL_GD<=gd.thresh)
ans <- list(TPAIR_PHCL=NA_integer_, NTPAIR_PHCL=NA_integer_)
if(nrow(gds)>0)
{
tmp <- subset(tinfo, IDX_T==SUB_IDX_T, c(IDPOP, IDTR, IDREC))
ans <- merge(gds, tmp, by='IDPOP')
set(ans, NULL, 'IDPOP', ans[, as.integer(gsub('IDPOP_','',IDPOP))])
set(ans, NULL, 'IDPOP_CL', ans[, as.integer(gsub('IDPOP_','',IDPOP_CL))])
set(ans, NULL, 'IDTR', ans[, as.integer(IDTR)])
set(ans, NULL, 'IDREC', ans[, as.integer(IDREC)])
# get phylogenetically closest pairs
ans[, CL_PH_PAIR:= ans[, as.character(as.numeric(IDPOP<IDPOP_CL))]]
tmp <- ans[, which(CL_PH_PAIR=='1')]
set(ans, tmp, 'CL_PH_PAIR', ans[tmp, paste(IDPOP,IDPOP_CL,sep=',')])
tmp <- ans[, which(CL_PH_PAIR=='0')]
set(ans, tmp, 'CL_PH_PAIR', ans[tmp, paste(IDPOP_CL,IDPOP,sep=',')])
setkey(ans, CL_PH_PAIR)
# define tr->POP pairs
ans[, TR_PAIR:= ans[, as.character(as.numeric(IDPOP<IDTR))]]
tmp <- ans[, which(TR_PAIR=='1')]
set(ans, tmp, 'TR_PAIR', ans[tmp, paste(IDPOP,IDTR,sep=',')])
tmp <- ans[, which(TR_PAIR=='0')]
set(ans, tmp, 'TR_PAIR', ans[tmp, paste(IDTR,IDPOP,sep=',')])
# define POP->rec pairs
ans[, REC_PAIR:= ans[, as.character(as.numeric(IDPOP<IDREC))]]
tmp <- ans[, which(REC_PAIR=='1')]
set(ans, tmp, 'REC_PAIR', ans[tmp, paste(IDPOP,IDREC,sep=',')])
tmp <- ans[, which(REC_PAIR=='0')]
set(ans, tmp, 'REC_PAIR', ans[tmp, paste(IDREC,IDPOP,sep=',')])
# determine matches
ans[, IN:= CL_PH_PAIR==TR_PAIR]
tmp <- ans[, which(!is.na(IDREC) & !IN)]
set(ans, tmp, 'IN', ans[tmp, CL_PH_PAIR==REC_PAIR])
# calculate proportion of phylog closest pairs that are true transmission pairs
ans <- ans[, list(TRUE_PAIR=any(IN)), by='CL_PH_PAIR']
ans <- list(TPAIR_PHCL= ans[, length(which(TRUE_PAIR))], NTPAIR_PHCL= ans[, length(which(!TRUE_PAIR))])
}
ans
}, by=c('IDX')]
sucl
}
##--------------------------------------------------------------------------------------------------------
## olli 13.07.16
##--------------------------------------------------------------------------------------------------------
treedist.closest.ind.reconstructed.oftruepairs<- function(submitted.info, tinfo.pairs, strs)
{
tmp <- subset(submitted.info, MODEL=='R')[, {
print(IDX)
#IDX<- 557; SUB_IDX_T<-2; SC<- '150701_REGIONAL_TRAIN2'
ph <- strs[[IDX]]
model.reg <- grepl('REGIONAL',SC)
gds <- treedist.closest.ind(ph, model.reg)
gds <- subset(gds, IDPOP_CL_GD<=Inf)
# get correct phylogenetically closest pairs
z <- SUB_IDX_T
z <- subset(tinfo.pairs, SUB_IDX_T==z)
z <- subset(z, TRUE_PAIR_Inf)
# get phylogenetically closest pairs in simulation
ans <- copy(gds)
set(ans, NULL, 'IDPOP', ans[, as.integer(gsub('IDPOP_','',IDPOP))])
set(ans, NULL, 'IDPOP_CL', ans[, as.integer(gsub('IDPOP_','',IDPOP_CL))])
ans[, CL_PH_PAIR:= ans[, as.character(as.numeric(IDPOP<IDPOP_CL))]]
tmp <- ans[, which(CL_PH_PAIR=='1')]
set(ans, tmp, 'CL_PH_PAIR', ans[tmp, paste(IDPOP,IDPOP_CL,sep=',')])
tmp <- ans[, which(CL_PH_PAIR=='0')]
set(ans, tmp, 'CL_PH_PAIR', ans[tmp, paste(IDPOP_CL,IDPOP,sep=',')])
setkey(ans, CL_PH_PAIR)
# calculate which of the correct phylogenetically closest pairs are also in the simulation
ans <- merge(z, unique(ans, by='CL_PH_PAIR'), by='CL_PH_PAIR', all.x=1)
list(TR_PAIR_rec= ans[, mean(!is.na(IDPOP))] )
}, by=c('IDX')]
tmp
}
##--------------------------------------------------------------------------------------------------------
## olli 30.04.16
##--------------------------------------------------------------------------------------------------------
treedist.closest.ind<- function(ph, model.reg)
{
tmp <- cophenetic.phylo(ph)
diag(tmp) <- Inf
ans <- data.table(IDPOP=rownames(tmp), IDPOP_CL=colnames(tmp)[apply(tmp, 1, which.min)])
ans <- merge(ans, ans[, list(IDPOP_CL_GD=tmp[IDPOP, IDPOP_CL]), by='IDPOP'], by='IDPOP')
if( !model.reg )
{
set(ans, NULL, 'IDPOP', ans[, toupper(gsub('-FEMALE|-MALE','',sapply(strsplit(IDPOP,'_',fixed=1),'[[',1)))] )
set(ans, NULL, 'IDPOP_CL', ans[, toupper(gsub('-FEMALE|-MALE','',sapply(strsplit(IDPOP_CL,'_',fixed=1),'[[',1)))] )
}
if( model.reg )
{
set(ans, NULL, 'IDPOP', ans[, sapply(strsplit(IDPOP,'|',fixed=1),'[[',1)] )
set(ans, NULL, 'IDPOP_CL', ans[, sapply(strsplit(IDPOP_CL,'|',fixed=1),'[[',1)] )
}
ans
}
##--------------------------------------------------------------------------------------------------------
## olli 03.12.15
##--------------------------------------------------------------------------------------------------------
treedist.robinsonfould.wrapper<- function(submitted.info, ttrs, strs, check.binary.sim=TRUE)
{
setkey(submitted.info, IDX)
#tmp <- subset(submitted.info, IDX==321)[1,]
#IDX<- 321; TIME_IDX_T<-1
tmp <- submitted.info[, {
cat('\nAt IDX', IDX)
stree <- unroot(strs[[IDX]])
otree <- unroot(multi2di(ttrs[[TIME_IDX_T]], random=FALSE))
if(check.binary.sim && !is.binary.tree(stree))
{
cat('\nFound non-binary tree at IDX',IDX)
stree <- multi2di(stree, random=FALSE)
}
#print(stree)
#print(otree)
z <- setdiff(otree$tip.label, stree$tip.label)
stopifnot( length(z)==abs(diff(c(Ntip(otree), Ntip(stree)))) )
if(length(z))
otree <- unroot(drop.tip(otree, z))
#https://groups.google.com/forum/#!topic/raxml/JgvxgknTeqw
#normalize with 2n-6
rf <- RF.dist(otree, stree, check.labels=TRUE)
list(RF=rf, NRF=rf/(2*Ntip(otree)-6), TAXA_NJ=Ntip(otree))
}, by='IDX']
tmp
}
##--------------------------------------------------------------------------------------------------------
## olli 03.12.15
##--------------------------------------------------------------------------------------------------------
treedist.robinsonfouldclusters.wrapper<- function(submitted.info, ttrs, strs, tinfo)
{
#tmp <- subset(submitted.info, MODEL=='Model: Regional')[1,]
#IDX<- 1; TIME_IDX_T<-12
setkey(tinfo, IDX_T)
tmp <- subset(submitted.info, MODEL=='R')[, {
#IDX<- 208; TIME_IDX_T<- 16
cat('\nAt IDX', IDX)
stree <- strs[[IDX]]
otree <- ttrs[[TIME_IDX_T]]
z <- TIME_IDX_T
z <- subset(tinfo, CLU_N>3 & IDX_T==z)
setkey(z, TAXA)
z <- unique(z, by='TAXA')
z <- merge(z, data.table(TAXA=stree$tip.label, IN_STREE=1), by='TAXA', all.x=1)
z <- merge(z, z[, list(CLU_NS= length(which(IN_STREE==1))), by='IDCLU'], by='IDCLU')
z <- subset(z, CLU_NS>3)
if(nrow(z))
{
#IDCLU <- 6
#TAXA <- subset(z, IDCLU==6)[, TAXA]
ans <- z[, {
sclu <- unroot(drop.tip(stree, setdiff(stree$tip.label,TAXA)))
oclu <- unroot(drop.tip(otree, union( setdiff(otree$tip.label, stree$tip.label), setdiff(otree$tip.label,TAXA))))
rf <- RF.dist(oclu, sclu, check.labels=TRUE)
list(TAXA_NC=Ntip(oclu), RFC=rf, NRFC=rf/(2*Ntip(oclu)-6))
}, by='IDCLU']
}
if(!nrow(z))
ans <- data.table(IDCLU=NA_integer_, TAXA_NC=NA_integer_, RFC=NA_integer_, NRFC=NA_real_)
ans
}, by='IDX']
tmp
}
##--------------------------------------------------------------------------------------------------------
## olli 07.11.15
##--------------------------------------------------------------------------------------------------------
treedist.billera.add<- function(submitted.info=NULL, ttrs=NULL, strs=NULL, file=NULL, with.save=0)
{
# file<- '/work/or105/Gates_2014/tree_comparison/submitted_151101.rda'
require(ape)
require(data.table)
require(distory)
stopifnot( !is.null(submitted.info) || !is.null(file))
if(is.null(submitted.info))
{
load(file)
with.save <- 1
}
tmp <- submitted.info[, {
tmp <- lapply(IDX, function(i) strs[[i]]$tip.label)
list(POSTHOC_ROOT=Reduce(intersect, tmp)[1])
}, by='IDX_T']
submitted.info <- merge(submitted.info, tmp, by='IDX_T')
#tmp <- subset(submitted.info, IDX==65)[1,]
#IDX<- 65; IDX_T<-3; POSTHOC_ROOT<-'IDPOP_101537|F|DOB_2000.58|2019.48'
#POSTHOC_ROOT<-'HOUSE3326-7343-FEMALE_SAMPLED_30.4797372334259'
tmp <- submitted.info[, {
cat('\nFT: At IDX', IDX)
stree <- strs[[IDX]]
otree <- multi2di(ttrs[[IDX_T]], random=FALSE)
if(!is.binary.tree(stree))
{
cat('\nFound non-binary tree at IDX',IDX)
stree <- multi2di(stree, random=FALSE)
}
#print(stree)
#print(otree)
z <- setdiff(otree$tip.label, stree$tip.label)
stopifnot( length(z)==abs(diff(c(Ntip(otree), Ntip(stree)))) )
if(length(z))
otree <- drop.tip(otree, z)
otree <- root(otree, outgroup=POSTHOC_ROOT, resolve.root=TRUE)
stree <- root(stree, outgroup=POSTHOC_ROOT, resolve.root=TRUE)
tmp <- data.table(TAXA=otree$tip.label, TAXA_NEW=seq_len(Ntip(otree)))
otree$tip.label <- tmp[, TAXA_NEW]
setkey(tmp, TAXA)
stree$tip.label <- tmp[stree$tip.label, ][, TAXA_NEW]
tmp <- dist.multiPhylo( list(otree, stree) )[1]
list(BILL=tmp)
}, by='IDX']
submitted.info <- merge(submitted.info, tmp, by='IDX')
#
if(with.save)
save(strs, ttrs, tinfo, submitted.info, sclu.info, file=gsub('\\.rda','_BL\\.rda',file))
#
setkey(tinfo, IDX_T)
# IDX_T<- IDX<- 1
tmp <- subset(submitted.info, MODEL=='R')[, {
cat('\nCT: At IDX', IDX)
stree <- strs[[IDX]]
otree <- ttrs[[IDX_T]]
z <- IDX_T
z <- subset(tinfo, CLU_N>3 & IDX_T==z)
z <- merge(z, data.table(TAXA=stree$tip.label, IN_STREE=1), by='TAXA', all.x=1)
z <- merge(z, z[, list(CLU_NS= length(which(IN_STREE==1))), by='IDCLU'], by='IDCLU')
z <- subset(z, CLU_NS>3)
if(nrow(z))
{
#TAXA <- subset(z, IDCLU==22)[, TAXA]
ans <- z[, {
print(IDCLU)
sclu <- drop.tip(stree, setdiff(stree$tip.label,TAXA), rooted=TRUE)
oclu <- drop.tip(otree, union( setdiff(otree$tip.label, stree$tip.label), setdiff(otree$tip.label,TAXA)), rooted=TRUE)
tmp <- data.table(TAXA=oclu$tip.label, TAXA_NEW=seq_len(Ntip(oclu)))
oclu$tip.label <- tmp[, TAXA_NEW]
setkey(tmp, TAXA)
sclu$tip.label <- tmp[sclu$tip.label, ][, TAXA_NEW]
if(!is.rooted(sclu) | !is.rooted(oclu))
{
sclu <- root(sclu, outgroup='1',resolve.root=1)
oclu <- root(oclu, outgroup='1',resolve.root=1)
}
tmp <- dist.multiPhylo( list(oclu, sclu) )[1]
#print(tmp)
list(BILL=tmp)
}, by='IDCLU']
}
if(!nrow(z))
ans <- data.table(IDCLU=NA_integer_, BILL=NA_real_)
ans
}, by='IDX']
sclu.info <- merge(submitted.info, tmp, by='IDX')
if(with.save)
save(strs, ttrs, tinfo, submitted.info, sclu.info, file=gsub('\\.rda','_BL\\.rda',file))
}
##--------------------------------------------------------------------------------------------------------
## olli 19.11.15
##--------------------------------------------------------------------------------------------------------
treecomparison.submissions.151119<- function()
{
require(data.table)
require(ape)
require(phangorn)
#
# get true trees
#
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim_internal/freeze_July15'
tfiles <- list.files(indir, pattern='newick$', full.names=TRUE)
tfiles <- data.table( FILE_T= tfiles[ grepl('SUBSTTREE', tfiles) | grepl('Vill_99', tfiles) | grepl('Vill.*DATEDTREE', tfiles) ] )
tfiles[, SC:= toupper(gsub('_SUBSTTREE|_DATEDTREE','',gsub('.newick','',basename(FILE_T))))]
tmp <- rbind( subset(tfiles, SC=='VILL_99_APR15'), subset(tfiles, SC=='VILL_99_APR15'), subset(tfiles, SC=='VILL_99_APR15') )
set(tmp, NULL, 'SC', c('150701_VILL_SCENARIO-C','150701_VILL_SCENARIO-D','150701_VILL_SCENARIO-E'))
tfiles <- rbind(tfiles, tmp)
tmp <- list.files(indir, pattern='newick$', full.names=TRUE)
tmp <- data.table( FILE_T= tmp[ grepl('Reg.*DATEDTREE', tmp) ] )
tmp[, SC:= toupper(gsub('_SUBSTTREE|_DATEDTREE','',gsub('.newick','',basename(FILE_T))))]
tfiles <- rbind(tfiles, tmp)
tfiles[, BRL_T:= 'time']
set(tfiles, tfiles[, which(grepl('REG',SC) & grepl('SUBST',FILE_T))], 'BRL_T', 'subst')
ttrs <- lapply(tfiles[, FILE_T], function(x) read.tree(file=x) )
names(ttrs) <- tfiles[, SC]
for(z in c('VILL_99_APR15','150701_VILL_SCENARIO-C','150701_VILL_SCENARIO-D','150701_VILL_SCENARIO-E'))
ttrs[[z]] <- root(ttrs[[z]], node=Ntip(ttrs[[z]])+2, resolve.root=1)
tfiles[, IDX_T:=seq_along(ttrs)]
tfiles[, TAXAN_T:= sapply(ttrs, Ntip)]
# info on true trees
tinfo <- merge(tfiles, do.call('rbind',lapply(seq_along(ttrs), function(i) data.table(TAXA=ttrs[[i]]$tip.label, IDX_T=i))), by='IDX_T')
tinfo[, IDPOP:=NA_character_]
tmp <- tinfo[, which(grepl('REGIONAL',SC))]
set(tinfo, tmp, 'IDPOP', tinfo[tmp,regmatches(TAXA, regexpr('IDPOP_[0-9]+',TAXA))])
tmp <- tinfo[, which(!grepl('REGIONAL',SC))]
set(tinfo, tmp, 'IDPOP', tinfo[tmp, regmatches(TAXA, regexpr('HOUSE[0-9]+-[0-9]+|House[0-9]+-[0-9]+',TAXA))])
stopifnot(subset(tinfo, grepl('VILL',SC))[, length(which(substring(TAXA,1,10)!=substring(IDPOP,1,10)))]==0)
stopifnot( tinfo[, length(which(is.na(IDPOP)))==0] )
set(tinfo, NULL, 'IDPOP', tinfo[,toupper(IDPOP)])
set(tinfo, NULL, 'TAXA', tinfo[,toupper(TAXA)])
# read cluster membership from DATEDCLUTREES
tmp <- list.files(indir, pattern='DATEDCLUTREES', full.names=TRUE)
tmp <- data.table( FILE_CLU_T= tmp,
SC= toupper(gsub('_DATEDCLUTREES','',gsub('.newick','',basename(tmp)))),
BRL_T= 'time')
tfiles <- merge(tfiles, tmp, by=c('SC','BRL_T'), all=1)
tmp <- subset(tfiles, !is.na(FILE_CLU_T))[, {
z <- read.tree(FILE_CLU_T)
do.call('rbind',lapply(seq_along(z), function(i) data.table(IDCLU=i, TAXA=z[[i]]$tip.label)))
}, by=c('SC','BRL_T')]
tinfo <- merge(tinfo, tmp, by=c('SC','BRL_T','TAXA'), all=1)
tmp <- subset(tinfo, !is.na(IDCLU))[, list(CLU_N= length(IDPOP)), by=c('SC','BRL_T','IDCLU')]
tinfo <- merge(tinfo, tmp, by=c('SC','BRL_T','IDCLU'), all=1)
# read sequences and determine %gappiness
tmp <- list.files(indir, pattern='fa$|fasta$', full.names=TRUE)
tmp <- data.table( FILE_SEQ_T= tmp, SC= toupper(gsub('_SIMULATED','',gsub('.fa','',basename(tmp)))))
z <- subset(tmp, SC=='VILL_99_APR15')
set(z, NULL, 'SC', '150701_VILL_SCENARIO-C')
tmp <- rbind( tmp, z )
tfiles <- merge(tfiles, tmp, by='SC', all=1)
tmp <- subset(tfiles, !is.na(FILE_SEQ_T))[, {
z <- read.dna(FILE_SEQ_T, format='fasta')
ans <- sapply(seq_len(nrow(z)), function(i) base.freq(z[i,], all=1))
ans <- apply(ans[c('n','-','?'),], 2, sum)
list(TAXA=rownames(z), GPS=ans)
}, by=c('SC','BRL_T')]
tinfo <- merge(tinfo, tmp, by=c('SC','BRL_T','TAXA'), all.x=1)
#
# get submitted trees
#
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/IQTree/IQTree201507'
infiles <- list.files(indir, pattern='treefile$', recursive=1, full.names=1)
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/IQTree/IQTree201510'
infiles <- c(infiles, list.files(indir, pattern='treefile$', recursive=1, full.names=1))
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/PhyML'
infiles <- c(infiles, list.files(indir, pattern='*tree*', recursive=1, full.names=1))
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/RAxML'
infiles <- c(infiles, list.files(indir, pattern='*RAxML_bestTree*', recursive=1, full.names=1))
infiles <- c(infiles, list.files(indir, pattern="best_tree.newick", recursive=1, full.names=1))
infiles <- data.table(FILE=infiles)
strs <- lapply(infiles[, FILE], function(x)
{
cat(x)
read.tree(file=x)
})
names(strs) <- infiles[, FILE]
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/MetaPIGA'
tmp <- list.files(indir, pattern='*result*', recursive=1, full.names=1)
tmp <- data.table(FILE=tmp)
tmp.trees <- lapply(tmp[, FILE], function(x)
{
cat(x)
read.nexus(file=x)
})
sapply(tmp.trees, length)
MetaPIGA.trees <- c(lapply(tmp.trees, '[[', 1), lapply(tmp.trees, '[[', 2), lapply(tmp.trees, '[[', 3), lapply(tmp.trees, '[[', 4))
names(MetaPIGA.trees) <- c(sapply(tmp.trees, function(x) paste(names(x)[1],'_use',sep='')), sapply(tmp.trees, function(x) names(x)[2]), sapply(tmp.trees, function(x) names(x)[3]), sapply(tmp.trees, function(x) names(x)[4]))
names(MetaPIGA.trees) <- gsub("'",'',names(MetaPIGA.trees), fixed=1)
strs <- c(strs, MetaPIGA.trees)
submitted.info <- data.table(FILE=names(strs))
#
#
#
submitted.info[, IDX:=seq_along(strs)]
submitted.info[, TEAM:=NA_character_]
set(submitted.info, submitted.info[, which(grepl('RAXML|RAxML',FILE))], 'TEAM', 'RAXML')
set(submitted.info, submitted.info[, which(grepl('IQTree',FILE))], 'TEAM', 'IQTree')
set(submitted.info, submitted.info[, which(grepl('MetaPIGA|Consensus pruning|Best individual of population',FILE))], 'TEAM', 'MetaPIGA')
set(submitted.info, submitted.info[, which(grepl('PhyML',FILE))], 'TEAM', 'PhyML')
stopifnot( submitted.info[, length(which(is.na(TEAM)))==0] )
#
# scenario
#
submitted.info[, SC:=NA_character_]
tmp <- submitted.info[, which(grepl('150701_Regional_TRAIN[0-9]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, regmatches(FILE, regexpr('150701_Regional_TRAIN[0-9]',FILE))])
tmp <- submitted.info[, which(grepl('150701_Vill_SCENARIO-[A-Z]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, regmatches(FILE, regexpr('150701_Vill_SCENARIO-[A-Z]',FILE))])
tmp <- submitted.info[, which(is.na(SC) & grepl('TRAIN[0-9]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, paste('150701_Regional_',regmatches(FILE, regexpr('TRAIN[0-9]',FILE)),sep='')])
tmp <- submitted.info[, which(is.na(SC) & grepl('scenario[A-Z]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, paste('150701_Vill_',regmatches(FILE, regexpr('scenario[A-Z]',FILE)),sep='')])
tmp <- submitted.info[, which(is.na(SC) & grepl('150701_regional_train[0-9]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, regmatches(FILE, regexpr('150701_regional_train[0-9]',FILE))])
tmp <- submitted.info[, which(is.na(SC) & grepl('150701_vill_scenario-[A-Z]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, regmatches(FILE, regexpr('150701_vill_scenario-[A-Z]',FILE))])
tmp <- submitted.info[, which(is.na(SC) & grepl('Vill_99_Apr15', FILE))]
set(submitted.info, tmp, 'SC', 'Vill_99_Apr15')
set(submitted.info, NULL, 'SC', submitted.info[, toupper(SC)])
tmp <- submitted.info[, which(grepl('150701_VILL_SCENARIO[A-Z]', SC))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, gsub('150701_VILL_SCENARIO','150701_VILL_SCENARIO-',SC)])
stopifnot( submitted.info[, length(which(is.na(SC)))==0] )
#
# set covariates of scenarios
#
tmp <- data.table( SC= c("150701_REGIONAL_TRAIN1","150701_REGIONAL_TRAIN2","150701_REGIONAL_TRAIN3","150701_REGIONAL_TRAIN4" ,"150701_REGIONAL_TRAIN5", "150701_VILL_SCENARIO-A", "150701_VILL_SCENARIO-B", "VILL_99_APR15","150701_VILL_SCENARIO-C", "150701_VILL_SCENARIO-D", "150701_VILL_SCENARIO-E"),
MODEL= c('R','R','R','R','R','V','V','V','V','V','V'),
SEQCOV= c(0.16, 0.16, 0.16, 0.16, 0.16, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6),
ACUTE= c('low', 'low', 'high', 'low', 'high', 'high', 'high', 'high', 'high', 'high', 'high'),
GAPS= c('none', 'low', 'low', 'high', 'high', 'low', 'high', 'none', 'none', 'low', 'high'),
ART= c('none', 'none', 'none', 'none', 'none', 'none', 'none', 'fast', 'fast', 'fast', 'fast'),
EXT= c('5pc', '5pc', '5pc', '5pc', '5pc', '~0pc', '~0pc', '~0pc', '~0pc', '~0pc', '~0pc')
)
submitted.info <- merge(submitted.info, tmp, by='SC')
#
# best tree for each scenario
#
submitted.info[, BEST:='N']
set(submitted.info, submitted.info[, which(grepl('RAxML', FILE) & grepl('best_tree', FILE))], 'BEST', 'Y')
# copied from ListOfBestTrees_IQTree150818.txt
# there are several best trees for some scenarios
tmp <- c( '150701_Vill_SCENARIO-A_IQTree150814_partition_12_3_07',
'150701_Vill_SCENARIO-A_IQTree150814_partition_12_3_04.',
'150701_Vill_SCENARIO-B_IQTree150814_partition_12_3_03.',
'Vill_99_Apr15_IQTree150814_partition_123.',
'150701_Vill_SCENARIO-D_IQTree150814_partition_12_3.',
'150701_Vill_SCENARIO-E_IQTree150814_partition_12_3.',
'150701_Vill_SCENARIO-A_IQTree150814_pol_partition_12_3.',
'150701_Vill_SCENARIO-B_IQTree150814_pol_partition_12_3_05.',
'Vill_99_Apr15_IQTree150814_pol_partition_12_3_09.',
'Vill_99_Apr15_IQTree150814_pol_partition_12_3_10.',
'150701_Vill_SCENARIO-D_IQTree150814_pol_partition_12_3_05.',
'150701_Vill_SCENARIO-D_IQTree150814_pol_partition_12_3_06.',
'150701_Vill_SCENARIO-D_IQTree150814_pol_partition_12_3_09.',
'150701_Vill_SCENARIO-E_IQTree150814_pol_partition_12_3_06.',
'150701_Regional_TRAIN1_IQTree150818_partition_123_03.',
'150701_Regional_TRAIN1_IQTree150818_pol_partition_123_05.')
tmp <- sapply(tmp, function(x) submitted.info[, which((grepl('IQTree150814/', FILE, fixed=1) | grepl('IQTree150818/', FILE, fixed=1)) & grepl(x, FILE, fixed=1))] )
set(submitted.info, tmp, 'BEST', 'Y')
tmp <- c('150701_Regional_TRAIN2_IQTree151019_partition_123_10',
'150701_Regional_TRAIN3_IQTree151019_partition_123_03',
'150701_Regional_TRAIN4_IQTree151019_partition_123_10',
'150701_Regional_TRAIN5_IQTree151019_partition_123_01',
'150701_Regional_TRAIN2_IQTree151019_pol_partition_123_08',
'150701_Regional_TRAIN3_IQTree151019_pol_partition_123_08',
'150701_Regional_TRAIN4_IQTree151019_pol_partition_123_05',
'150701_Regional_TRAIN5_IQTree151019_pol_partition_123_10')
tmp <- sapply(tmp, function(x) submitted.info[, which((grepl('IQTree151019', FILE, fixed=1)) & grepl(x, FILE, fixed=1))] )
set(submitted.info, tmp, 'BEST', 'Y')
# PhyML no replicates: all files best
set(submitted.info, submitted.info[, which(TEAM=='PhyML')], 'BEST', 'Y')
#
# set OTHER (ie old or some preliminary/unknown tree)
#
submitted.info[, OTHER:='N']
# MetaPIGA tree to be used is first in nexus list (which was tagged with best above)
set(submitted.info, submitted.info[, which(TEAM=='MetaPIGA' & !grepl('use', FILE))], 'OTHER', 'Y')
# IQTree did several uploads, use only most recent in main analysis
set(submitted.info, submitted.info[, which(grepl('150701_Regional_TRAIN1_IQTree150814', FILE))], 'OTHER', 'Y')
set(submitted.info, submitted.info[, which(TEAM=='IQTree' & MODEL=='R' & !grepl('TRAIN1', SC) & grepl('201507/',FILE,fixed=1))], 'OTHER', 'Y')
#
# set which gene used to construct tree (either pol or concatenated gag+pol+env)
#
submitted.info[, GENE:=NA_character_]
set(submitted.info, submitted.info[, which(TEAM=='RAXML' & grepl('full', FILE))], 'GENE', 'GAG+POL+ENV')
set(submitted.info, submitted.info[, which(TEAM=='RAXML' & grepl('pol', FILE))], 'GENE', 'POL')
stopifnot(nrow(subset(submitted.info, TEAM=='RAXML' & is.na(GENE)))==0)
set(submitted.info, submitted.info[, which(TEAM=='PhyML')], 'GENE', 'POL')
set(submitted.info, submitted.info[, which(TEAM=='MetaPIGA')], 'GENE', 'GAG+POL+ENV')
set(submitted.info, submitted.info[, which(TEAM=='IQTree' & grepl('[0-9]_partition', FILE))], 'GENE', 'GAG+POL+ENV')
set(submitted.info, submitted.info[, which(TEAM=='IQTree' & grepl('[0-9]_pol_partition', FILE))], 'GENE', 'POL')
stopifnot(nrow(subset(submitted.info, TEAM=='IQTree' & is.na(GENE)))==0)
#
# number taxa in tree
#
setkey(submitted.info, IDX)
submitted.info[, TAXAN:= sapply(strs, Ntip)]
#
# are trees rooted?
#
setkey(submitted.info, IDX)
submitted.info[, ROOTED:=factor(sapply(strs, is.rooted),levels=c(TRUE,FALSE),labels=c('Y','N'))]
#
# add BRL_UNITS
#
submitted.info[, BRL:='subst']
#
# add index of true tree
#
require(phangorn)
tmp <- subset(tfiles, select=c('SC','IDX_T','BRL_T'))
tmp <- dcast.data.table(tmp, SC~BRL_T, value.var='IDX_T')
setnames(tmp, c('subst','time'), c("SUB_IDX_T","TIME_IDX_T"))
submitted.info <- merge(submitted.info, tmp, by='SC')
submitted.info <- merge(submitted.info, unique(subset(tfiles, select=c('SC','TAXAN_T')), by=c('SC','TAXAN_T')), by='SC')
stopifnot(nrow(subset(submitted.info, TAXAN>TAXAN_T))==0)
#
# fix taxa names that teams have changed
#
tmp <- subset(submitted.info, TEAM=='IQTree' & MODEL=='R')[, IDX]
for(i in tmp)
{
strs[[i]]$tip.label <- sapply(strsplit(strs[[i]]$tip.label,'_'), function(x) paste(x[1],'_',x[2],'|',x[3],'|',x[4],'_',x[5],'|',x[6],sep='') )
}
for(i in seq_along(strs))
{
strs[[i]]$tip.label <- toupper(strs[[i]]$tip.label)
}
for(i in seq_along(ttrs))
{
ttrs[[i]]$tip.label <- toupper(ttrs[[i]]$tip.label)
}
###
tmp <- subset(submitted.info, TEAM=='PhyML' & MODEL=='R')[, IDX]
for(i in tmp)
{
z <- data.table(IDX=seq_along(strs[[i]]$tip.label), IDPOP=regmatches(strs[[i]]$tip.label, regexpr('IDPOP_[0-9]+',strs[[i]]$tip.label)), SC=subset(submitted.info, IDX==i)[,SC])
z <- merge(subset(tinfo, BRL_T=='time', select=c(IDPOP,SC,TAXA)), z, by=c('IDPOP','SC'))
setkey(z, IDX)
stopifnot(nrow(z)==Ntip(strs[[i]]))
strs[[i]]$tip.label <- z[, TAXA]
}
tmp <- subset(submitted.info, TEAM=='PhyML' & MODEL=='V')[, IDX]
for(i in tmp)
{
z <- data.table(IDX=seq_along(strs[[i]]$tip.label), IDPOP=regmatches(strs[[i]]$tip.label, regexpr('HOUSE[0-9]+-[0-9]+|House[0-9]+-[0-9]+',strs[[i]]$tip.label)), SC=subset(submitted.info, IDX==i)[,SC])
z <- merge(subset(tinfo, BRL_T=='time', select=c(IDPOP,SC,TAXA)), z, by=c('IDPOP','SC'))
stopifnot(nrow(z)==length(strs[[i]]$tip.label))
setkey(z, IDX)
strs[[i]]$tip.label <- z[, TAXA]
}
#
# compute Robinson Fould of complete tree
#
tmp <- treedist.robinsonfould.wrapper(submitted.info, ttrs, strs)
submitted.info <- merge(submitted.info, tmp, by='IDX')
# compute Robinson Fould of clusters, then take sum
tmp <- treedist.robinsonfouldclusters.wrapper(submitted.info, ttrs, strs, tinfo)
sclu.info <- merge(subset(submitted.info, select=c("IDX","SC","FILE","TEAM","MODEL","SEQCOV","ACUTE","GAPS","ART","EXT","BEST","OTHER","GENE","TAXAN","ROOTED","BRL","SUB_IDX_T","TIME_IDX_T","TAXAN_T")), tmp, by='IDX')
#
strs.new <- strs
ttrs.new <- ttrs
tinfo.new <- copy(tinfo)
submitted.info.new <- copy(submitted.info)
sclu.info.new <- copy(sclu.info)
outdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation'
z <- load(paste(outdir, 'submitted_151119_S.rda', sep='/'))
stopifnot( nrow(subset(merge(subset(submitted.info, select=c('FILE','IDX')), subset(submitted.info.new, select=c('FILE','IDX')), by='FILE'), IDX.x!=IDX.y))==0 )
submitted.info <- merge(submitted.info.new, subset(submitted.info, select=c('IDX','NQD','lm_intercept','lm_slope','lm_rsq')), by='IDX')
strs <- strs.new
ttrs <- ttrs.new
sclu.info <- merge(sclu.info.new, subset(sclu.info, select=c('IDX','IDCLU','NQDC')), by=c('IDX','IDCLU'))
#
outfile <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation/submitted_151119_SRFQD.rda'
save(strs, strs_lsd_brl, strs_lsd_date, ttrs, tinfo, submitted.info, sclu.info, file=outfile)
}
##--------------------------------------------------------------------------------------------------------
## olli 25.07.16
##--------------------------------------------------------------------------------------------------------
treecomparison.explaingaps.bams.160817<- function()
{
require(scales)
require(ggplot2)
#
# collect information on batches
#
wdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/explaingaps'
load(file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment.rda'))
load(file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_logistic.rda'))
bami <- subset(dpand, PR=='1R' & POS=='PR_1' & !is.na(PANGEA_ID) & !is.na(SANGER_ID), select=c(PANGEA_ID, STUDY_ID, SANGER_ID))
# extract location of samples in batch
bami[, BATCH:= sapply(strsplit(SANGER_ID,'_'),'[[',1)]
bami[, LOC:= substr(gsub('PG[0-9]+-','',PANGEA_ID),1,2)]
bami <- bami[, list(LOC= paste(unique(LOC),collapse='-')), by='BATCH']
# extract significant batches
tmp <- subset(m2f.1.or, u95<0.95)
tmp[, PR:='2F']
tmp2 <- subset(m2r.1.or, u95<0.95)
tmp2[, PR:='2R']
tmp <- rbind(tmp, tmp2)
tmp <- subset(tmp, grepl('BATCH', COEF))
set(tmp, NULL, 'BATCH', tmp[,gsub('BATCH','',COEF)])
#
setkey(tmp, PR, OR)
ggplot(tmp, aes(x=BATCH, y=OR, ymin=l95, ymax=u95)) + geom_point() + geom_errorbar() + facet_grid(~PR) + coord_flip()
ggsave(file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_batch_oddsratio.pdf'), w=7, h=10)
# batches are typically significant in both 2F and 2R
tmp2 <- dcast.data.table(tmp, BATCH~PR, value.var='OR')
setnames(tmp2, c('2F','2R'), c('OR_2F','OR_2R'))
bami <- merge(bami, tmp2, by='BATCH',all.x=1)
tmp2 <- dcast.data.table(tmp, BATCH~PR, value.var='u95')
setnames(tmp2, c('2F','2R'), c('ORu95_2F','ORu95_2R'))
bami <- merge(bami, tmp2, by='BATCH',all.x=1)
tmp2 <- dcast.data.table(tmp, BATCH~PR, value.var='l95')
setnames(tmp2, c('2F','2R'), c('ORl95_2F','ORl95_2R'))
bami <- merge(bami, tmp2, by='BATCH',all.x=1)
bami <- melt(bami, id.vars=c('BATCH','LOC'))
set(bami, bami[, which(is.na(value))],'value', 1)
bami <- dcast.data.table(bami, BATCH+LOC~variable, value.var='value')
#
# get length of short reads
#
infile <- '~/Dropbox (SPH Imperial College)/2015_PANGEA_DualPairsFromFastQIVA/readlengths/bam_stats_150218.rda'
load(infile)
bam.len[, BATCH:= sapply(strsplit(FILE,'_'),'[[',1)]
tmp <- subset(bam.len,QU>=40 & QU<320)[, list(CDF.bamlen= mean(CDF)), by=c('BATCH','QU')]
tmp <- merge(tmp, bami, by='BATCH')
ggplot(tmp, aes(y=CDFm, x=QU, colour=LOC, group=BATCH)) +
geom_line() +
#geom_boxplot() +
scale_y_continuous(labels=scales::percent, expand=c(0,0)) +
theme_bw() + labs(y='average frequency in run\n(cumulated)\n', x='\nlength of quality-trimmed short reads\n(nt)', fill='sequence run') +
theme(legend.position='bottom')
ggsave(file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_bamlen_by_run.pdf'), w=14, h=7)
ggplot(tmp, aes(y=CDF.bamlen, x=QU, colour=OR_2F, group=BATCH)) +
geom_line() +
scale_x_continuous(breaks=seq(0,5e2,50), expand=c(0,0)) +
scale_colour_gradientn(colours=c('red','grey50')) +
scale_y_continuous(labels=scales::percent, expand=c(0,0)) +
theme_bw() + labs(y='average frequency of read length of individuals in batch\n(cumulated)\n', x='\nlength of quality-trimmed short reads\n(nt)', colour='odds ratio for batch effect\nin 2F primer analysis\n(0 is strong batch effect) ') +
theme(legend.position='bottom')
ggsave(file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_bamlen_by_OR2F.pdf'), w=14, h=7)
ggplot(tmp, aes(y=CDF.bamlen, x=QU, colour=OR_2R, group=BATCH)) +
geom_line() +
scale_x_continuous(breaks=seq(0,5e2,50), expand=c(0,0)) +
scale_colour_gradientn(colours=c('red','grey50')) +
scale_y_continuous(labels=scales::percent, expand=c(0,0)) +
theme_bw() +
labs(y='average frequency of read length of individuals in batch\n(cumulated)\n', x='\nlength of quality-trimmed short reads\n(nt)', colour='odds ratio for batch effect\nin 2R primer analysis\n(0 is strong batch effect) ') +
theme(legend.position='bottom')
ggsave(file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_bamlen_by_OR2R.pdf'), w=14, h=7)
#
# OK so those significant on the 2R primer are also a bit shorter than others
#
#
bam.cov[, BATCH:= sapply(strsplit(FILE_ID,'_'),'[[',1)]
bam.cov <- merge(bam.cov, bami, by='BATCH')
setkey(bam.cov, BATCH, FILE_ID, POS)
bam.covs<- lapply(bam.cov[, unique(BATCH)], function(batch)
{
cat('\nprocess run', batch)
tmp <- subset(bam.cov, BATCH==batch)
tmp <- tmp[, {
z <- rep(COV,REP)
list(COV=z, POS=seq_along(z), REF=REF[1])
}, by=c('FILE_ID','BATCH')]
tmp <- tmp[, list(QU=paste('QU',100*c(0.25, 0.5, 0.75),sep=''), COV=quantile(COV, p=c(0.25, 0.5, 0.75))), by=c('BATCH','REF','POS')]
tmp <- dcast.data.table(tmp, REF+BATCH+POS~QU, value.var='COV')
tmp
})
bam.covs<- do.call('rbind', bam.covs)
stopifnot( bam.covs[, length(unique(REF))]==1 ) #if not the same reference, then the positions do not share the same coordinate system
bam.covs<- merge(bam.covs, bami, by='BATCH')
ggplot(bam.covs, aes(y=QU50, x=POS, colour=OR_2R, group=BATCH)) +
geom_line() +
scale_colour_gradientn(colours=c('red','grey50')) +
scale_x_continuous(expand=c(0,0), breaks=seq(0,2e4,1e3)) +
scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x), labels = scales::trans_format("log10", scales::math_format(10^.x))) +
theme_bw() + labs(y='average number of reads per individual in batch\n', x='\nnt position in reference', colour='odds ratio for batch effect\nin 2R primer analysis\n(0 is strong batch effect) ') +
theme(legend.position='bottom')
ggsave(file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_bamcov_by_OR2R.pdf'), w=14, h=7)
ggplot(bam.covs, aes(y=QU50, x=POS, colour=OR_2F, group=BATCH)) +
geom_line() +
scale_colour_gradientn(colours=c('red','grey50')) +
scale_x_continuous(expand=c(0,0), breaks=seq(0,2e4,1e3)) +
scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x), labels = scales::trans_format("log10", scales::math_format(10^.x))) +
theme_bw() + labs(y='average number of reads per individual in batch\n', x='\nnt position in reference', colour='odds ratio for batch effect\nin 2F primer analysis\n(0 is strong batch effect) ') +
theme(legend.position='bottom')
ggsave(file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_bamcov_by_OR2F.pdf'), w=14, h=7)
tmp <- subset(bam.covs, POS>=1500 & POS<=4500)
tmp[, OR_2F:= factor(OR_2F<1, levels=c(TRUE,FALSE), labels=c('Yes','No'))]
tmp[, OR_2R:= factor(OR_2R<1, levels=c(TRUE,FALSE), labels=c('Yes','No'))]
tmp[, POSc:= as.numeric(as.character(cut(POS, breaks=seq(1500,4500,250), labels=seq(1501,4500,250))))]
tmp <- subset(tmp, !is.na(POSc))
set(tmp, NULL, 'POSc', tmp[, paste(POSc,'-',POSc+249,sep='')])
ggplot(tmp, aes(x=POSc, y=QU50, fill=OR_2F)) + geom_boxplot(outlier.shape=NA) +
scale_fill_manual(values=c('Yes'='red','No'='grey50')) +
scale_x_discrete(expand=c(0,0)) +
coord_trans(y='log10p1') +
scale_y_continuous(breaks=c(1,10,100, 1e3,1e4, 1e5)) +
theme_bw() +
labs(y='average number of reads per individual in batch\n', x='\nnt position in reference', fill='significant odds ratio for batch effect\nin 2F primer analysis ') +
theme(legend.position='bottom')
ggsave(file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_bamcov_by_OR2F_boxpol.pdf'), w=12, h=6)
ggplot(tmp, aes(x=POSc, y=QU50, fill=OR_2R)) + geom_boxplot(outlier.shape=NA) +
scale_fill_manual(values=c('Yes'='red','No'='grey50')) +
scale_x_discrete(expand=c(0,0)) +
coord_trans(y='log10p1') +
scale_y_continuous(breaks=c(1,10,100, 1e3,1e4, 1e5)) +
theme_bw() +
labs(y='average number of reads per individual in batch\n', x='\nnt position in reference', fill='significant odds ratio for batch effect\nin 2R primer analysis ') +
theme(legend.position='bottom')
ggsave(file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_bamcov_by_OR2R_boxpol.pdf'), w=12, h=6)
tmp <- copy(bam.covs)
tmp[, PLOT_ROW:= factor(POS<=4500, levels=c(TRUE,FALSE),labels=c('1st half of genome','2nd half of genome'))]
tmp[, OR_2F:= factor(OR_2F<1, levels=c(TRUE,FALSE), labels=c('Yes','No'))]
tmp[, OR_2R:= factor(OR_2R<1, levels=c(TRUE,FALSE), labels=c('Yes','No'))]
tmp[, POSc:= as.numeric(as.character(cut(POS, breaks=seq(0,9250,250), labels=seq(1,9250,250))))]
tmp <- subset(tmp, !is.na(POSc))
setkey(tmp, POSc)
tmp2 <- tmp[, unique(POSc)]
set(tmp, NULL, 'POSc', tmp[, factor(POSc, levels=tmp2, labels=paste(tmp2,'-',tmp2+249,sep=''))])
ggplot(tmp, aes(x=POSc, y=log10(QU50+.1), fill=OR_2R)) + geom_boxplot(outlier.shape=NA) +
scale_fill_manual(values=c('Yes'='red','No'='grey50')) +
scale_x_discrete(expand=c(0,0)) +
scale_y_continuous(breaks=c(0,1,2,3,4,5), labels=c(1,10,100, 1e3,1e4, 1e5)) +
theme_bw() +
facet_wrap(~PLOT_ROW, ncol=1, scale='free_x') +
labs(y='average number of reads per individual in batch\n', x='\nnt position in reference', fill='significant odds ratio for batch effect ') +
theme(legend.position='bottom')
ggsave(file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_bamcov_by_OR2R_boxall.pdf'), w=16, h=12)
}
##--------------------------------------------------------------------------------------------------------
## olli 25.07.16
##--------------------------------------------------------------------------------------------------------
treecomparison.samplecharacteristics.160817<- function()
{
require(ape)
require(scales)
require(ggplot2)
require(data.table)
require(Hmisc)
wdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/explaingaps'
wfile <- 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment.rda'
load(file.path(wdir,wfile))
#
# merge MRC Uganda
#
set(dm, dm[, which(grepl('MRC',COHORT))], 'COHORT', 'UG-MRC')
# use categorical values from UCL
set(dm, NULL, 'RECENTVL', dm[, as.numeric(RECENTVL)])
set(dm, dm[, which(is.na(RECENTVL) & RECENTVL_U<=1e4)], 'RECENTVL', 1e4-1)
set(dm, dm[, which(is.na(RECENTVL) & RECENTVL_U<=5e4)], 'RECENTVL', 5e4-1)
set(dm, dm[, which(is.na(RECENTVL) & RECENTVL_U<=1e5)], 'RECENTVL', 1e5-1)
set(dm, dm[, which(is.na(RECENTVL) & RECENTVL_U>1e5)], 'RECENTVL', 1e5+1)
stopifnot( !nrow(subset(dm, !is.na(RECENTVL_U) & is.na(RECENTVL))) )
#
# table on sequence characteristics by cohort
#
pngi <- dpand[, list(NT_DIFF_SUM=sum(NT_DIFF)), by=c('TAXA','PR')]
set(pngi, NULL, 'PR', pngi[, paste('NT_DIFF_',PR,sep='')])
set(pngi, pngi[, which(NT_DIFF_SUM>=1)], 'NT_DIFF_SUM', 1L)
set(pngi, pngi[, which(is.na(NT_DIFF_SUM))], 'NT_DIFF_SUM', 2L)
set(pngi, NULL, 'PR_NTDIFFc', pngi[, as.character(factor(NT_DIFF_SUM, levels=c(2,0,1), labels=c('unassembled','no mutation','at least one mutation')))])
pngi <- dcast.data.table(pngi, TAXA~PR, value.var='PR_NTDIFFc')
pngi <- merge(pngi, dm, by='TAXA')
# sex by cohort
set(pngi, pngi[, which(is.na(SEX))], 'SEX', 'missing')
pngs <- pngi[, {
z<- table(SEX)
list(STAT='SEX', N= as.numeric(z), LABEL= attr(z, 'dimnames')[[1]])
}, by='COHORT']
pngs <- merge(pngs, as.data.table(expand.grid(COHORT=pngs[, unique(COHORT)], STAT='SEX', LABEL= pngs[, unique(LABEL)], stringsAsFactors=FALSE)), by=c('COHORT','STAT','LABEL'), all=1)
set(pngs, NULL, 'LABEL', pngs[, factor(LABEL, levels=c("F", "M", "missing"))])
# if only sampling year known, set to midpoint
tmp <- pngi[, which(SAMPLEDATE==floor(SAMPLEDATE))]
set(pngi, tmp, 'SAMPLEDATE', pngi[tmp, SAMPLEDATE+.5])
# sampling years
pngi[, SAMPLEYR:= as.character(floor(SAMPLEDATE))]
set(pngi, pngi[, which(is.na(SAMPLEYR))], 'SAMPLEYR', 'missing')
tmp <- pngi[, {
z<- table(SAMPLEYR)
list(STAT='SAMPLEYR', N= as.numeric(z), LABEL= attr(z, 'dimnames')[[1]])
}, by='COHORT']
tmp <- merge(tmp, as.data.table(expand.grid(COHORT=tmp[, unique(COHORT)], STAT='SAMPLEYR', LABEL= tmp[, unique(LABEL)], stringsAsFactors=FALSE)), by=c('COHORT','STAT','LABEL'), all=1)
set(tmp, NULL, 'LABEL', tmp[, factor(LABEL, levels=c("2009","2010","2011", "2012", "2013","2014","missing"))])
pngs <- rbind(pngs, tmp)
# recent viral load around sampling time
pngi[, DUMMY:= 'missing']
tmp <- pngi[, which(!is.na(RECENTVLDATE) & !is.na(SAMPLEDATE) & abs(RECENTVLDATE-SAMPLEDATE)<1)]
#set(pngi, tmp, 'DUMMY', pngi[tmp, cut(as.numeric(RECENTVL), right=FALSE, breaks=c(-Inf, 1e4, 2e4, 4e4, 1e5, Inf), labels=c('<10,000','10,000-19,999','20,000-39,999','40,000-99,999','>=100,000'))])
set(pngi, tmp, 'DUMMY', pngi[tmp, cut(as.numeric(RECENTVL), right=FALSE, breaks=c(-Inf, 1e4, 5e4, 1e5, Inf), labels=c('<10,000','10,000-49,999','50,000-99,999','>=100,000'))])
tmp <- pngi[, {
z<- table(DUMMY)
list(STAT='RECENTVL', N= as.numeric(z), LABEL= attr(z, 'dimnames')[[1]])
}, by='COHORT']
tmp <- merge(tmp, as.data.table(expand.grid(COHORT=tmp[, unique(COHORT)], STAT='RECENTVL', LABEL= tmp[, unique(LABEL)], stringsAsFactors=FALSE)), by=c('COHORT','STAT','LABEL'), all=1)
#set(tmp, NULL, 'LABEL', tmp[, factor(LABEL, levels=c('<10,000','10,000-19,999','20,000-39,999','40,000-99,999','>=100,000',"missing"))])
set(tmp, NULL, 'LABEL', tmp[, factor(LABEL, levels=c('<10,000','10,000-49,999','50,000-99,999','>=100,000',"missing"))])
pngs <- rbind(pngs, tmp)
# ART
pngi[, DUMMY:= NULL]
pngi[, DUMMY:= 'N']
tmp <- pngi[, which(ARTSTART==floor(ARTSTART))]
set(pngi, tmp, 'ARTSTART', pngi[tmp, ARTSTART+.5])
tmp <- pngi[, which(ARTSTART<=SAMPLEDATE | PREVARTUSE=='Y' | COHORT=='BW-Mochudi' & CURRENTLYONART=='Y' | (everSelfReportArt==1 & FirstSelfReportArt<SAMPLEDATE))]
set(pngi, tmp, 'DUMMY', 'Y')
tmp <- pngi[, which( (COHORT=='RCCS' & (is.na(everSelfReportArt))) | #RCCS does not code CURRENTLYONART, and missing ARTSTART may indicate ART not yet started
(COHORT=='BW-Mochudi' & is.na(CURRENTLYONART)) | #BW codes already on ART
(COHORT=='AC_Resistance' & is.na(ARTSTART)) | #in the resistance cohort, there must be an ART start date
(COHORT=='UG-MRC' & is.na(CURRENTLYONART)))]
set(pngi, tmp, 'DUMMY', 'missing')
tmp <- pngi[, {
z<- table(DUMMY)
list(STAT='PREVARTATSAMPLING', N= as.numeric(z), LABEL= attr(z, 'dimnames')[[1]])
}, by='COHORT']
tmp <- merge(tmp, as.data.table(expand.grid(COHORT=tmp[, unique(COHORT)], STAT='PREVARTATSAMPLING', LABEL= tmp[, unique(LABEL)], stringsAsFactors=FALSE)), by=c('COHORT','STAT','LABEL'), all=1)
set(tmp, NULL, 'LABEL', tmp[, factor(LABEL, levels=c('Y','N',"missing"))])
pngs <- rbind(pngs, tmp)
# Age
tmp <- pngi[, which(is.na(AGE) & !is.na(DOB) & !is.na(SAMPLEDATE))]
set(pngi, tmp, 'AGE', pngi[tmp, SAMPLEDATE-DOB])
pngi[, DUMMY:= NULL]
pngi[, DUMMY:= 'missing']
tmp <- pngi[, which(!is.na(AGE))]
set(pngi, tmp, 'DUMMY', pngi[tmp, cut(AGE, breaks=c(-Inf, 25, 30, 35, 40, Inf), labels=c('<25','<30','<35','<40','>=40'))])
tmp <- pngi[, {
z<- table(DUMMY)
list(STAT='AGE', N= as.numeric(z), LABEL= attr(z, 'dimnames')[[1]])
}, by='COHORT']
tmp <- merge(tmp, as.data.table(expand.grid(COHORT=tmp[, unique(COHORT)], STAT='AGE', LABEL= tmp[, unique(LABEL)], stringsAsFactors=FALSE)), by=c('COHORT','STAT','LABEL'), all=1)
set(tmp, NULL, 'LABEL', tmp[, factor(LABEL, levels=c('<25','<30','<35','<40','>=40',"missing"))])
pngs <- rbind(pngs, tmp)
# Subtype
tmp <- pngi[, {
z<- table(COMET_CONS)
list(STAT='SUBTYPE', N= as.numeric(z), LABEL= attr(z, 'dimnames')[[1]])
}, by='COHORT']
tmp <- merge(tmp, as.data.table(expand.grid(COHORT=tmp[, unique(COHORT)], STAT='SUBTYPE', LABEL= tmp[, unique(LABEL)], stringsAsFactors=FALSE)), by=c('COHORT','STAT','LABEL'), all=1)
pngs <- rbind(pngs, tmp)
# Primer assembled
tmp <- melt(subset(pngi, select=c('COHORT',colnames(pngi)[grepl('NT_DIFF',colnames(pngi))])), id.vars='COHORT')
tmp <- tmp[, {
z<- table(value)
list(STAT='PRIMER', N= as.numeric(z), LABEL= attr(z, 'dimnames')[[1]])
}, by=c('COHORT','variable')]
set(tmp, NULL, 'STAT', tmp[, paste(STAT,gsub('NT_DIFF','',variable),sep='')])
tmp[, variable:=NULL]
pngs <- rbind(pngs, tmp)
#
# add proportions
#
set(pngs, pngs[, which(is.na(N))], 'N', 0)
tmp <- subset(pngs, STAT=='SEX')[, list(STAT='TOTAL', LABEL='TOTAL', N=sum(N)), by='COHORT']
pngs <- merge(pngs, dcast.data.table(tmp, COHORT~STAT, value.var='N'), by='COHORT')
pngs[, P:= 100*round(N/TOTAL, d=2)]
#
# make table
#
pngst <- dcast.data.table(pngs, STAT+LABEL~COHORT, value.var='P')
tmp <- subset(pngs, STAT=='SEX')[, list(STAT='TOTAL', LABEL='TOTAL', N=sum(N)), by='COHORT']
tmp <- dcast.data.table(tmp, STAT~COHORT, value.var='N')
tmp[, LABEL:='']
pngst <- rbind(subset(tmp, select=c('STAT', 'LABEL', 'AC_Resistance', 'BW-Mochudi', 'RCCS', 'UG-MRC')), pngst,use.names=TRUE,fill=TRUE)
#
save(pngi, pngs, pngst, file=file.path(wdir, gsub('\\.rda','_samplecharacteristics.rda', wfile)))
write.csv(pngst, row.names=FALSE, file=file.path(wdir, gsub('\\.rda','_samplecharacteristics.csv', wfile)))
#
# primers, assembled
#
tmp <- subset(pngs, grepl('PRIMER',STAT) & LABEL!='unassembled')[, list(LABEL=LABEL, PCA= round(N/sum(N),d=3)), by=c('STAT','COHORT')]
tmp <- dcast.data.table(tmp, STAT+LABEL~COHORT, value.var='PCA')
}
##--------------------------------------------------------------------------------------------------------
## olli 25.07.16
##--------------------------------------------------------------------------------------------------------
treecomparison.explaingaps.mutationspectrum.160725<- function()
{
require(ape)
require(scales)
require(data.table)
require(Hmisc)
wdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/explaingaps'
wfile <- 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment.rda'
load(file.path(wdir, wfile))
#
# compare mutations of taxa that amplified to those that did not
#
z <- merge(dgd, dm, by='TAXA')
dsmut <- subset(z, GENE%in%c('1R-3F-firsthalf','1R-3F-secondhalf','2F-1R','2R-4F','4F-3R-firsthalf') & UNASS<0.1 , select=c(TAXA, GENE, COHORT))
dsmut[, TYPE:= 'Coverage_>90pc']
tmp <- subset(z, GENE%in%c('1R-3F-firsthalf','1R-3F-secondhalf','2F-1R','2R-4F','4F-3R-firsthalf') & UNASS>0.95 , select=c(TAXA, GENE, COHORT))
tmp[, TYPE:= 'Coverage_<5pc']
dsmut <- rbind(dsmut, tmp)
tmp <- subset(z, GENE%in%c('1R-3F-firsthalf','1R-3F-secondhalf','2F-1R','2R-4F','4F-3R-firsthalf') & UNASS>0.95 & COMET_Region1=='A1', select=c(TAXA, GENE, COHORT))
tmp[, TYPE:= 'Coverage_<5pc_A1']
dsmut <- rbind(dsmut, tmp)
tmp <- subset(z, GENE%in%c('1R-3F-firsthalf','1R-3F-secondhalf','2F-1R','2R-4F','4F-3R-firsthalf') & UNASS>0.95 & COMET_Region1=='D', select=c(TAXA, GENE, COHORT))
tmp[, TYPE:= 'Coverage_<5pc_D']
dsmut <- rbind(dsmut, tmp)
tmp <- subset(z, GENE%in%c('1R-3F-firsthalf','1R-3F-secondhalf','2F-1R','2R-4F','4F-3R-firsthalf') & UNASS>0.95 & COMET_Region1=='C', select=c(TAXA, GENE, COHORT))
tmp[, TYPE:= 'Coverage_<5pc_C']
dsmut <- rbind(dsmut, tmp)
# merge with primer mutations
dsmut <- merge(dsmut, subset(dpand, PR%in%c('1R','2F','2R','3F','4F')), by='TAXA',allow.cartesian=TRUE)
#
dsmut <- subset(dsmut, (GENE=='1R-3F-firsthalf' & PR=='2F') |
(GENE=='1R-3F-secondhalf' & PR=='2R') |
(GENE=='2F-1R' & PR=='1R') |
(GENE=='2R-4F' & PR=='3F') |
(GENE=='4F-3R-firsthalf' & PR=='4F'))
set(dsmut, NULL, 'POS', dsmut[,as.integer(gsub('PR_','',POS))])
set(dsmut, NULL, 'PR', dsmut[, paste('PR_',PR, sep='')])
# plot
tmp <- dsmut[, {
z <- round(as.numeric(binconf(length(which(NT_DIFF==1)), length(which(!is.na(NT_DIFF))))), d=3)
list(EST=c('central','l95','u95'), VAL= z)
}, by=c('GENE','TYPE','COHORT','PR','POS')]
tmp <- dcast.data.table(tmp, PR+POS+TYPE+COHORT~EST, value.var='VAL')
tmp <- merge(tmp, as.data.table(expand.grid(PR=tmp[, unique(PR)], POS=tmp[, unique(POS)], TYPE=tmp[, unique(TYPE)], COHORT=tmp[, unique(COHORT)], stringsAsFactors=FALSE)), by=c('PR','POS','TYPE','COHORT'), all=1, allow.cartesian=TRUE)
set(tmp, tmp[, which(is.na(central) | central<=0.001)],'l95',NA_real_)
set(tmp, tmp[, which(is.na(central) | central<=0.001)],'u95',NA_real_)
set(tmp, tmp[, which(is.na(central) | central<=0.001)],'central',0)
#set(tmp, NULL, 'TYPE', tmp[, factor(TYPE, levels=c("PANGEA_All","Rakai_All","Rakai_Coverage_>90pc","Rakai_Coverage_<5pc","Rakai_VL_<2e4_Coverage_>90pc", "Rakai_VL_>4e4_Coverage_<5pc","Rakai_D_Coverage_<5pc", "Rakai_A1_Coverage_<5pc", "Rakai_C_Coverage_<5pc"))])
#ggplot(subset(tmp, TYPE%in%c('PANGEA_All',"Rakai_Coverage_>90pc","Rakai_Coverage_<5pc","Rakai_D_Coverage_<5pc", "Rakai_A1_Coverage_<5pc", "Rakai_C_Coverage_<5pc")), aes(x=POS, fill=TYPE)) +
setkey(tmp, COHORT, TYPE)
set(tmp, NULL, 'COHORT', tmp[, factor(COHORT, levels=c("AC_Resistance","BW-Mochudi","RCCS","UG-MRC"), labels=c("South Africa\nResistance Cohort","Mochudi Prevention\nProject","Rakai Community\nCohort Study","Uganda\nMRC"))])
ggplot(subset(tmp, TYPE%in%c('Coverage_>90pc','Coverage_<5pc') & COHORT!="South Africa\nResistance Cohort"), aes(x=POS, fill=TYPE)) +
geom_bar(aes(y=central), stat='identity', width=0.7, position=position_dodge(0.8)) +
facet_grid(PR+COHORT~., scales='free_y') +
geom_errorbar(aes(ymin= l95, ymax=u95), position=position_dodge(0.8), width=0.3, size=0.4) +
theme_bw() + theme(legend.position='bottom') +
scale_x_continuous(breaks=tmp[, seq_len(max(POS))]) +
scale_y_continuous(labels=percent) +
scale_fill_brewer(palette='Set1') +
scale_alpha_manual(values=c('Coverage_<5pc'=1, 'Coverage_>90pc'=0.7)) +
#coord_cartesian(ylim=c(0,.2)) +
labs(x='\nNucleotide position in primer (always forward sense)', y='proportion of assembled sequences with mutation from primer\n', fill='group of\nsequences')
ggsave(file=file.path(wdir,gsub('.rda','_gapsprimers_1R2F2R3F4F_eval1.pdf',wfile)), h=25, w=10, limitsize = FALSE)
ggplot(subset(tmp, PR%in%c('PR_2F','PR_2R') & TYPE%in%c('Coverage_>90pc','Coverage_<5pc_A1','Coverage_<5pc_C','Coverage_<5pc_D') & COHORT!="Africa Centre\nResistance Cohort"), aes(x=POS, fill=TYPE)) +
geom_bar(aes(y=central), stat='identity', width=0.7, position=position_dodge(0.8)) +
facet_grid(PR+COHORT~., scales='free_y') +
geom_errorbar(aes(ymin= l95, ymax=u95), position=position_dodge(0.8), width=0.3, size=0.4) +
theme_bw() + theme(legend.position='bottom') +
scale_x_continuous(breaks=tmp[, seq_len(max(POS))]) +
scale_y_continuous(labels=percent) +
scale_fill_brewer(palette='Set1') +
#coord_cartesian(ylim=c(0,.2)) +
labs(x='\nNucleotide position in primer (always forward sense)', y='proportion of assembled sequences with mutation from primer\n', fill='group of\nsequences')
ggsave(file=file.path(wdir,gsub('.rda','_gapsprimers_2F2R_eval2.pdf',wfile)), h=10, w=10, limitsize = FALSE)
#
#
# old stuff
#
#
dsmut <- subset(dpand, PR=='1R' & POS=='PR_1' &
UNASS_HALF_INDIR_P<0.1 &
RECENTVL<2e4 & abs(SAMPLEDATE-RECENTVLDATE)<.5, select=c(TAXA, PR, UNASS_HALF_INDIR_P, UNASS_TO_NEXTPRIMER_P))
dsmut[, TYPE:= 'Rakai_VL_<2e4_Coverage_>90pc']
tmp <- subset(dpand, PR=='3F' & POS=='PR_1' &
UNASS_HALF_INDIR_P<0.1 &
RECENTVL<2e4 & abs(SAMPLEDATE-RECENTVLDATE)<.5, select=c(TAXA, PR, UNASS_HALF_INDIR_P, UNASS_TO_NEXTPRIMER_P))
tmp[, TYPE:= 'Rakai_VL_<2e4_Coverage_>90pc']
dsmut <- rbind(dsmut, tmp)
# high viral load and no coverage
tmp <- subset(dpand, PR=='1R' & POS=='PR_1' &
UNASS_HALF_INDIR_P>0.95 &
(is.na(ARTSTART) | SAMPLEDATE<ARTSTART) &
(!everSelfReportArt | everSelfReportArt & SAMPLEDATE<FirstSelfReportArt) &
RECENTVL>4e4 & abs(SAMPLEDATE-RECENTVLDATE)<.5, select=c(TAXA, PR, UNASS_HALF_INDIR_P, UNASS_TO_NEXTPRIMER_P))
tmp[, TYPE:= 'Rakai_VL_>4e4_Coverage_<5pc']
dsmut <- rbind(dsmut, tmp)
tmp <- subset(dpand, PR=='3F' & POS=='PR_1' &
UNASS_HALF_INDIR_P>0.95 &
(is.na(ARTSTART) | SAMPLEDATE<ARTSTART) &
(!everSelfReportArt | everSelfReportArt & SAMPLEDATE<FirstSelfReportArt) &
RECENTVL>4e4 & abs(SAMPLEDATE-RECENTVLDATE)<.5, select=c(TAXA, PR, UNASS_HALF_INDIR_P, UNASS_TO_NEXTPRIMER_P))
tmp[, TYPE:= 'Rakai_VL_>4e4_Coverage_<5pc']
dsmut <- rbind(dsmut, tmp)
# coverage
tmp <- subset(dpand, PR=='1R' & POS=='PR_1' & !is.na(STUDY_ID) &
UNASS_HALF_INDIR_P<0.1, select=c(TAXA, PR, UNASS_HALF_INDIR_P, UNASS_TO_NEXTPRIMER_P))
tmp[, TYPE:= 'Rakai_Coverage_>90pc']
dsmut <- rbind(dsmut, tmp)
tmp <- subset(dpand, PR=='1R' & POS=='PR_1' &
UNASS_HALF_INDIR_P>0.95, select=c(TAXA, PR, UNASS_HALF_INDIR_P, UNASS_TO_NEXTPRIMER_P))
tmp[, TYPE:= 'Rakai_Coverage_<5pc']
dsmut <- rbind(dsmut, tmp)
tmp <- subset(dpand, PR=='3F' & POS=='PR_1' & !is.na(STUDY_ID) &
UNASS_HALF_INDIR_P<0.1, select=c(TAXA, PR, UNASS_HALF_INDIR_P, UNASS_TO_NEXTPRIMER_P))
tmp[, TYPE:= 'Rakai_Coverage_>90pc']
dsmut <- rbind(dsmut, tmp)
tmp <- subset(dpand, PR=='3F' & POS=='PR_1' &
UNASS_HALF_INDIR_P>0.95, select=c(TAXA, PR, UNASS_HALF_INDIR_P, UNASS_TO_NEXTPRIMER_P))
tmp[, TYPE:= 'Rakai_Coverage_<5pc']
dsmut <- rbind(dsmut, tmp)
# population
tmp <- subset(dpand, PR=='1R' & POS=='PR_1' & !is.na(PANGEA_ID), select=c(TAXA, PR, UNASS_HALF_INDIR_P, UNASS_TO_NEXTPRIMER_P))
tmp[, TYPE:= 'PANGEA_All']
dsmut <- rbind(dsmut, tmp)
tmp <- subset(dpand, PR=='1R' & POS=='PR_1' & !is.na(STUDY_ID), select=c(TAXA, PR, UNASS_HALF_INDIR_P, UNASS_TO_NEXTPRIMER_P))
tmp[, TYPE:= 'Rakai_All']
dsmut <- rbind(dsmut, tmp)
tmp <- subset(dpand, PR=='3F' & POS=='PR_1' & !is.na(PANGEA_ID), select=c(TAXA, PR, UNASS_HALF_INDIR_P, UNASS_TO_NEXTPRIMER_P))
tmp[, TYPE:= 'PANGEA_All']
dsmut <- rbind(dsmut, tmp)
tmp <- subset(dpand, PR=='3F' & POS=='PR_1' & !is.na(STUDY_ID), select=c(TAXA, PR, UNASS_HALF_INDIR_P, UNASS_TO_NEXTPRIMER_P))
tmp[, TYPE:= 'Rakai_All']
dsmut <- rbind(dsmut, tmp)
# subtypes
tmp <- subset(dpand, PR=='1R' & POS=='PR_1' & COMET_Region1=='A1' &
UNASS_HALF_INDIR_P>0.95, select=c(TAXA, PR, UNASS_HALF_INDIR_P, UNASS_TO_NEXTPRIMER_P))
tmp[, TYPE:= 'Rakai_A1_Coverage_<5pc']
dsmut <- rbind(dsmut, tmp)
tmp <- subset(dpand, PR=='1R' & POS=='PR_1' & COMET_Region1=='C' &
UNASS_HALF_INDIR_P>0.95, select=c(TAXA, PR, UNASS_HALF_INDIR_P, UNASS_TO_NEXTPRIMER_P))
tmp[, TYPE:= 'Rakai_C_Coverage_<5pc']
dsmut <- rbind(dsmut, tmp)
tmp <- subset(dpand, PR=='1R' & POS=='PR_1' & COMET_Region1=='D' &
UNASS_HALF_INDIR_P>0.95, select=c(TAXA, PR, UNASS_HALF_INDIR_P, UNASS_TO_NEXTPRIMER_P))
tmp[, TYPE:= 'Rakai_D_Coverage_<5pc']
dsmut <- rbind(dsmut, tmp)
tmp <- subset(dpand, PR=='3F' & POS=='PR_1' & COMET_Region1=='A1' &
UNASS_HALF_INDIR_P>0.95, select=c(TAXA, PR, UNASS_HALF_INDIR_P, UNASS_TO_NEXTPRIMER_P))
tmp[, TYPE:= 'Rakai_A1_Coverage_<5pc']
dsmut <- rbind(dsmut, tmp)
tmp <- subset(dpand, PR=='3F' & POS=='PR_1' & COMET_Region1=='C' &
UNASS_HALF_INDIR_P>0.95, select=c(TAXA, PR, UNASS_HALF_INDIR_P, UNASS_TO_NEXTPRIMER_P))
tmp[, TYPE:= 'Rakai_C_Coverage_<5pc']
dsmut <- rbind(dsmut, tmp)
tmp <- subset(dpand, PR=='3F' & POS=='PR_1' & COMET_Region1=='D' &
UNASS_HALF_INDIR_P>0.95, select=c(TAXA, PR, UNASS_HALF_INDIR_P, UNASS_TO_NEXTPRIMER_P))
tmp[, TYPE:= 'Rakai_D_Coverage_<5pc']
dsmut <- rbind(dsmut, tmp)
tmp <- merge(subset(dpand, PR=='2F', select=c(TAXA, PR, POS, NT_DIFF, LOC, COMM_NUM, HH_NUM, SEX, AGE, COMET_Region1)), subset(dsmut, PR=='1R', c(TAXA, UNASS_HALF_INDIR_P, TYPE)), by=c('TAXA'), allow.cartesian=TRUE)
dsmut <- rbind(tmp, merge(subset(dpand, PR=='2R', select=c(TAXA, PR, POS, NT_DIFF, LOC, COMM_NUM, HH_NUM, SEX, AGE, COMET_Region1)), subset(dsmut, PR=='3F', c(TAXA, UNASS_HALF_INDIR_P, TYPE)), by=c('TAXA'), allow.cartesian=TRUE))
set(dsmut, NULL, 'POS', dsmut[,gsub('PR_','',POS)])
set(dsmut, NULL, 'PR', dsmut[, paste('PR_',PR, sep='')])
dsmut <- subset(dsmut, !is.na(POS))
#dcast.data.table(dsmut, TYPE+TAXA+PR1R_UNASS_TO_NEXTPRIMER_P+LOC+COMM_NUM+HH_NUM+SEX+AGE+COMET_Region1 ~ PR+POS, value.var='NT_DIFF')
tmp <- dsmut[,{
z <- round(as.numeric(binconf(length(which(NT_DIFF==1)), length(which(!is.na(NT_DIFF))))), d=3)
list(EST=c('central','l95','u95'), VAL= z)
}, by=c('TYPE','PR','POS')]
tmp <- dcast.data.table(tmp, PR+POS+TYPE~EST, value.var='VAL')
set(tmp, NULL, 'POS', tmp[, as.integer(POS)])
set(tmp, NULL, 'TYPE', tmp[, factor(TYPE, levels=c("PANGEA_All","Rakai_All","Rakai_Coverage_>90pc","Rakai_Coverage_<5pc","Rakai_VL_<2e4_Coverage_>90pc", "Rakai_VL_>4e4_Coverage_<5pc","Rakai_D_Coverage_<5pc", "Rakai_A1_Coverage_<5pc", "Rakai_C_Coverage_<5pc"))])
#ggplot(subset(tmp, TYPE%in%c('PANGEA_All',"Rakai_Coverage_>90pc","Rakai_Coverage_<5pc","Rakai_D_Coverage_<5pc", "Rakai_A1_Coverage_<5pc", "Rakai_C_Coverage_<5pc")), aes(x=POS, fill=TYPE)) +
ggplot(subset(tmp, TYPE%in%c('PANGEA_All',"Rakai_Coverage_>90pc","Rakai_Coverage_<5pc")), aes(x=POS, fill=TYPE)) +
geom_bar(aes(y=central), stat='identity', width=0.7, position=position_dodge(0.8)) +
facet_grid(PR~.) +
geom_linerange(aes(ymin= l95, ymax=u95), position=position_dodge(0.8)) +
theme_bw() + theme(legend.position='bottom') +
scale_x_continuous(breaks=tmp[, seq_len(max(POS))]) +
scale_y_continuous(labels=percent, expand=c(0,0)) +
coord_cartesian(ylim=c(0,.2)) +
labs(x='\nNucleotide position in primer\n(always forward sense)', y='PANGEA sequences with mutation from primer\n', fill='selected sequences')
ggsave(file=file.path(wdir,gsub('.rda','_gapsprimers_2F2R_eval1.pdf',wfile)), h=8, w=10, limitsize = FALSE)
ggplot(subset(tmp, TYPE%in%c('PANGEA_All',"Rakai_Coverage_<5pc", "Rakai_VL_>4e4_Coverage_<5pc")), aes(x=POS, fill=TYPE)) +
geom_bar(aes(y=central), stat='identity', width=0.7, position=position_dodge(0.8)) +
facet_grid(PR~.) +
geom_linerange(aes(ymin= l95, ymax=u95), position=position_dodge(0.8)) +
theme_bw() + theme(legend.position='bottom') +
scale_x_continuous(breaks=tmp[, seq_len(max(POS))]) +
scale_y_continuous(labels=percent, expand=c(0,0)) +
coord_cartesian(ylim=c(0,.5)) +
labs(x='\nNucleotide position in primer\n(always forward sense)', y='PANGEA sequences with mutation from primer\n', fill='selected sequences')
ggsave(file=file.path(wdir,gsub('.rda','_gapsprimers_2F2R_eval2.pdf',wfile)), h=10, w=12, limitsize = FALSE)
#
# plot selected data sets just to make sure I selected correctly
#
tmp <- unique(subset(dsmut, TYPE%in%c('Rakai_VL_high_Coverage_none','Rakai_VL_low_Coverage_high'), select=c(TAXA, TYPE)))
chr <- merge(ch, tmp, by='TAXA', all.x=1)
setkey(chr, TAXA)
tmp <- unique(chr, by='TAXA')
setkey(tmp, TYPE, COVP, TAXA)
tmp <- tmp[, list(TAXA=TAXA, PLOT=TYPE, PLOT_ID=seq_along(TAXA)), by='TYPE']
chr <- merge(chr, subset(tmp, select=c(TAXA, PLOT_ID)), by='TAXA')
ggplot(chr) +
geom_segment(aes(y=PLOT_ID, yend=PLOT_ID, x=POS_CH, xend=POS_CH+REP_CH-1L, colour=TYPE)) +
geom_rect(data=dpani, aes(xmin=START, xmax=END, ymin=-Inf, ymax=Inf), fill="black") +
facet_wrap(~TYPE, scales='free_y', ncol=6) +
scale_x_continuous(expand=c(0,0), breaks=dpani$START, labels=dpani$PR) +
scale_y_continuous(expand=c(0,0)) +
scale_colour_brewer(palette='Dark2') +
labs(x='\nalignment position', y='Rakai PANGEA-HIV sequences\n', colour='region') +
theme_bw() +
theme( legend.position='bottom', strip.text= element_blank(), strip.background=element_blank()) +
guides(colour=guide_legend(override.aes=list(size=5)))
}
##--------------------------------------------------------------------------------------------------------
## olli 25.07.16
##--------------------------------------------------------------------------------------------------------
treecomparison.explaingaps.regressions.170123<- function()
{
require(ape)
require(data.table)
wdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/explaingaps'
wfile <- 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment.rda'
load(file.path(wdir, wfile))
#
# merge MRC Uganda
#
set(dm, NULL, 'COMM_NUM', dm[, as.character(COMM_NUM)])
tmp <- dm[, which(grepl('MRC',COHORT))]
set(dm, tmp, 'COMM_NUM', dm[tmp, COHORT])
#set(dm, tmp, 'COHORT', 'UG-MRC')
set(dm, dm[, which(COHORT%in%c('MRC-SI-MAS','MRC-SUP-INF'))], 'COHORT', 'UG-MRC-FSW')
set(dm, dm[, which(COHORT%in%c('MRC','MRC_GPC','MRC-FF-Nsazi'))], 'COHORT', 'UG-MRC-PC')
set(dm, dm[, which(COHORT%in%c('MRC Historical Data'))], 'COHORT', 'UG-MRC-Historic')
# calculate total mutational distance in all primers
dr <- dpand[, list(NT_DIFF_SUM=sum(NT_DIFF)), by=c('TAXA','PR')]
#dr[, length(TAXA), by='PR']
set(dr, NULL, 'PR', dr[, paste('NT_DIFF_',PR,sep='')])
# merge with gene regions
if(0)
{
tmp <- subset(dgd, grepl('half',GENE) | (!grepl('GAG|POL|ENV',GENE) & !grepl('half',GENE)))
set(tmp, NULL, c('START','END','ACTG'), NULL)
}
if(1)
{
tmp <- subset(dgd.seqs, grepl('half',GENE) | (!grepl('GAG|POL|ENV',GENE) & !grepl('half',GENE)))
set(tmp, NULL, c('SANGER_ID','LEN_RAW','PANGEA_ID'), NULL)
setnames(tmp, 'UNASS_RAW', 'UNASS')
tmp <- unique(tmp)
}
dr <- merge(dr, tmp, by='TAXA',allow.cartesian=TRUE)
set(dr, dr[, which(is.na(NT_DIFF_SUM))], 'NT_DIFF_SUM', -1)
set(dr, dr[, which(NT_DIFF_SUM>0)],'NT_DIFF_SUM', 1)
set(dr, NULL, 'NT_DIFF_SUM', dr[, as.character(factor(NT_DIFF_SUM, levels=c(-1,0,1), labels=c('not_assembled','no mutation','at_least_one_mutation')))])
# select only meaningful comparisons between primer and gene region
dr <- dcast.data.table(dr, TAXA+GENE+UNASS~PR, value.var='NT_DIFF_SUM')
dr <- subset(dr, GENE%in%c('START-2F','1R-3F','2R-4F','3R-END'))
tmp <- subset(dr, GENE=='START-2F', c(TAXA, GENE, UNASS, NT_DIFF_2F))
setnames(tmp, colnames(tmp)[4], c('FLANK_R'))
tmp2 <- subset(dr, GENE=='1R-3F', c(TAXA, GENE, UNASS, NT_DIFF_2F, NT_DIFF_2R))
setnames(tmp2, colnames(tmp2)[4:5], c('FLANK_L','FLANK_R'))
tmp <- rbind(tmp, tmp2, fill=TRUE)
tmp2 <- subset(dr, GENE=='2R-4F', c(TAXA, GENE, UNASS, NT_DIFF_3F, NT_DIFF_3R))
setnames(tmp2, colnames(tmp2)[4:5], c('FLANK_L','FLANK_R'))
tmp <- rbind(tmp, tmp2)
tmp2 <- subset(dr, GENE=='3R-END', c(TAXA, GENE, UNASS, NT_DIFF_3R))
setnames(tmp2, colnames(tmp2)[4], c('FLANK_L'))
dr <- rbind(tmp, tmp2, fill=TRUE)
dr[, FLANK:= 'no_mutation']
set(dr, dr[, which(FLANK_L=='not_assembled'|FLANK_R=='not_assembled')], 'FLANK', 'at_least_one_not_assembled')
set(dr, dr[, which(FLANK_L=='at_least_one_mutation'|FLANK_R=='at_least_one_mutation')], 'FLANK', 'at_least_one_mutation')
#
# prepare regression factors
#
tmp <- subset(dm, select=c(PANGEA_ID, TAXA, STUDY_ID, EXTRACT_ID, SANGER_ID, SAMPLEDATE, ARTSTART, selfReportArt, everSelfReportArt, FirstSelfReportArt, CURRENTLYONART, RECENTVL, RECENTVL_U, RECENTVL_L, RECENTVLDATE, COMET_1F1R, COMET_3F4F, COMET_4F3R, COMET_CONS, COMET_1F1R_N, COMET_3F4F_N, COMET_4F3R_N, COMET_CONS_N, COHORT, LOC, COMM_NUM))
set(tmp, NULL, 'RECENTVL', tmp[, as.numeric(RECENTVL)])
dr <- merge(dr, tmp, by='TAXA')
# batches
dr[, BATCH:=NA_character_]
tmp <- dr[, which(!is.na(SANGER_ID))]
set(dr, tmp, 'BATCH', dr[tmp, regmatches(SANGER_ID,regexpr('^[0-9]+', SANGER_ID))])
dr[, BATCH2:= as.integer(BATCH)]
# add locations to BATCH id
tmp <- dr[, list(BATCH_LABEL=paste(BATCH, ' (', paste(unique(COHORT),collapse='+'), ')',sep='')), by='BATCH']
dr <- merge(dr, tmp, by='BATCH')
dr[, BATCH:=NULL]
setnames(dr, 'BATCH_LABEL', 'BATCH')
# to every batch add one 0 and one 1
if(1)
{
tmp <- dr[, list(TAXA=c('PRIOR1','PRIOR2'), PANGEA_ID=c('PRIOR1','PRIOR2'), STUDY_ID=c('PRIOR1','PRIOR2'), SANGER_ID=c('PRIOR1','PRIOR2'), UNASS=c(0,1), FLANK='no_mutation', COHORT=COHORT[1] ), by=c('BATCH','GENE')]
#tmp <- dr[, list(TAXA=c('PRIOR1','PRIOR2','PRIOR3','PRIOR4','PRIOR5','PRIOR6'), PANGEA_ID=c('PRIOR1','PRIOR2','PRIOR3','PRIOR4','PRIOR5','PRIOR6'), STUDY_ID=c('PRIOR1','PRIOR2','PRIOR3','PRIOR4','PRIOR5','PRIOR6'), SANGER_ID=c('PRIOR1','PRIOR2','PRIOR3','PRIOR4','PRIOR5','PRIOR6'), UNASS=c(0,0,0,1,1,1), COHORT=COHORT[1] ), by=c('BATCH','PR')]
dr <- rbind(dr, tmp, use.names=TRUE, fill=TRUE)
}
# average viral load per batch
tmp <- dr[, {
ans <- NA_character_
tmp <- which((COHORT=='BW-Mochudi' & !is.na(RECENTVL)) | abs(RECENTVLDATE-SAMPLEDATE)<1)
if(length(tmp)<length(COHORT)*.5)
ans <- 'No VL measured'
tmp <- as.character(cut(median(RECENTVL[tmp]), breaks=c(0, 1e4, 2e4, 4e4, 1e5, Inf), labels=c('<1e4','1e4-2e4','2e4-4e4','4e4-1e5','>1e5')))
#tmp <- as.character(cut(median(RECENTVL[tmp]), breaks=c(0, 1e4, 5e4, 1e5, Inf), labels=c('<1e4','1e4-5e4','5e4-1e5','>1e5')))
if(!is.na(tmp) & is.na(ans))
ans <- tmp
list(BATCHVL= ans)
}, by=c('BATCH','GENE')]
dr <- merge(dr, tmp, by=c('BATCH','GENE'))
# ART status
dr[, ART:= as.integer(ARTSTART<SAMPLEDATE)]
set(dr, dr[, which(is.na(ART))], 'ART', 0L)
set(dr, dr[, which(ART==0 & everSelfReportArt==1 & SAMPLEDATE<FirstSelfReportArt)], 'ART', 1L)
set(dr, dr[, which(ART==0 & COHORT=='BW-Mochudi' & CURRENTLYONART=='Y')], 'ART', 1L)
set(dr, dr[, which(ART==0 & grepl('UG-MRC',COHORT) & CURRENTLYONART=='Y')], 'ART', 1L)
set(dr, dr[, which(ART==0 & COHORT=='AC_Resistance')], 'ART', 0L)
set(dr, NULL, 'ART', dr[, factor(ART, levels=c(0L,1L), labels=c('no ART', 'ART started or self reported'))])
# use categorical values from UCL
if(1)
{
set(dr, dr[, which(is.na(RECENTVL) & RECENTVL_U<=1e4)], 'RECENTVL', 1e4-1)
set(dr, dr[, which(is.na(RECENTVL) & RECENTVL_U<=5e4)], 'RECENTVL', 5e4-1)
set(dr, dr[, which(is.na(RECENTVL) & RECENTVL_U<=1e5)], 'RECENTVL', 1e5-1)
set(dr, dr[, which(is.na(RECENTVL) & RECENTVL_U>1e5)], 'RECENTVL', 1e5+1)
stopifnot( !nrow(subset(dr, !is.na(RECENTVL_U) & is.na(RECENTVL))) )
}
# recent viral load per individual
dr[, VL:='No VL measured']
tmp <- dr[, which((COHORT=='BW-Mochudi' & !is.na(RECENTVL)) | (!is.na(RECENTVL) & abs(RECENTVLDATE-SAMPLEDATE)<1))]
#set(dr, tmp, 'VL', dr[tmp, cut(RECENTVL, breaks=c(-1, 1e4, 2e4, 4e4, 1e5, Inf), labels=c('<1e4','1e4-2e4','2e4-4e4','4e4-1e5','>1e5'))])
set(dr, tmp, 'VL', dr[tmp, cut(RECENTVL, breaks=c(-1, 1e4, 5e4, 1e5, Inf), labels=c('<1e4','1e4-5e4','5e4-1e5','>1e5'))])
# primers
dr[, FLANK2:= FLANK]
tmp <- dr[, which(FLANK2=='at_least_one_mutation')]
set(dr, tmp, 'FLANK2', dr[tmp, paste0(GENE,'_',FLANK2)])
dr[, FLANK3:= FLANK]
set(dr, dr[, which(grepl('mutation',FLANK3))], 'FLANK3', 'assembled')
# region, community number, household number
tmp <- dr[, which(!grepl('PRIOR',TAXA) & is.na(LOC))]
set(dr, tmp, 'LOC', dr[tmp, COHORT])
set(dr, NULL, 'COMM_NUM', dr[, as.character(COMM_NUM)])
tmp <- dr[, which(!grepl('PRIOR',TAXA) & is.na(COMM_NUM))]
set(dr, tmp, 'COMM_NUM', dr[tmp, COHORT])
# extraction IDs
set(dr, NULL, 'EXTRACT_ID', dr[,as.integer(gsub('^0+','',EXTRACT_ID))])
# sample date
dr[, SAMPLEDATEc:= cut(SAMPLEDATE, breaks=seq(2009, 2015, 0.25), labels=seq(2009, 2015-0.25, 0.25))]
# amplicon
dr[, AMPLICON:= dr[,factor(GENE, levels=c('START-2F','1R-3F','2R-4F','3R-END'), labels=c('1','2','3','4'))]]
# extraction run
dr[, BAD_E_RUN:= factor(BATCH2%in%c(15833,15892,15915,15880,15878,16018,16019,16035,16033,15916,15910,15931,15934,15964,15952,16040,16056,15935,15949,15950,15958), levels=c(FALSE,TRUE),labels=c('N','Y'))]
#
# exclude AfricaCentre because pre-selected after amplification
#
dr <- subset(dr, COHORT!='AC_Resistance')
#
# re-level so that 'presumably good' factors are the reference
#
set(dr, NULL, 'AMPLICON', dr[, relevel(factor(AMPLICON), ref='1')])
set(dr, NULL, 'FLANK3', dr[, relevel(factor(FLANK3), ref='assembled')])
set(dr, NULL, 'FLANK2', dr[, relevel(factor(FLANK2), ref='no_mutation')])
set(dr, NULL, 'FLANK', dr[, relevel(factor(FLANK), ref='no_mutation')])
# defining the reference level for batches is tricky:
# if it s a very good batch, then this could simply be down to high VL
# if it s an RCCS batch with average viral load, then it could still be down to community?
# OK how about we take a batch that performs as well as a typical UG-MRC run, eg 66% on pol?
# --> this is '14683'
set(dr, NULL, 'BATCH', dr[, relevel(factor(BATCH), ref='14683 (BW-Mochudi)')])
set(dr, NULL, 'COHORT', dr[, relevel(factor(COHORT), ref='BW-Mochudi')])
#set(dr, NULL, 'COHORT', dr[, relevel(factor(COHORT), ref='AC_Resistance')])
set(dr, NULL, 'VL', dr[, relevel(factor(VL), ref='>1e5')])
set(dr, NULL, 'BATCHVL', dr[, relevel(factor(BATCHVL), ref='4e4-1e5')])
#set(dr, NULL, 'BATCHVL', dr[, relevel(factor(BATCHVL), ref='5e4-1e5')])
set(dr, NULL, 'ART', dr[, relevel(ART, ref='no ART')])
set(dr, NULL, 'BAD_E_RUN', dr[, relevel(BAD_E_RUN, ref='N')])
set(dr, NULL, 'COMET_1F1R', dr[, relevel(factor(COMET_1F1R), ref='A1')])
set(dr, NULL, 'COMET_3F4F', dr[, relevel(factor(COMET_3F4F), ref='A1')])
set(dr, NULL, 'COMET_4F3R', dr[, relevel(factor(COMET_4F3R), ref='A1')])
set(dr, NULL, 'COMET_CONS', dr[, relevel(factor(COMET_CONS), ref='A1')])
set(dr, NULL, 'LOC', dr[, relevel(factor(LOC), ref='13')])
#set(dr, NULL, 'COMM_NUM', dr[, relevel(factor(COMM_NUM), ref='106')])
set(dr, NULL, 'COMM_NUM', dr[, relevel(factor(COMM_NUM), ref='BW-Mochudi')])
set(dr, NULL, 'SAMPLEDATEc', dr[, relevel(factor(SAMPLEDATEc), ref='2010.25')])
#
# select data
#
ggplot(dr, aes(x=UNASS)) + geom_histogram() + facet_wrap(~GENE)
# use <60% vs >80%
dr[, ASS:= as.numeric(as.character(cut(UNASS, breaks=c(-1, 0.6, 0.8, 2), labels=c('1','0.5','0'))))]
#dr[, ASS:= as.numeric(as.character(cut(UNASS, breaks=c(-1, 0.4, 0.6, 2), labels=c('1','0.5','0'))))]
dr <- subset(dr, ASS!=.5)
dr[, UNASS:= 1-ASS]
#
# prepare data sets for logistic regression
#
drs <- subset(dr, select=c("ASS", "UNASS", "GENE", "TAXA", "AMPLICON", "EXTRACT_ID", "PANGEA_ID", "BATCH", "BATCH2", "ART", "BATCHVL", "VL", "COHORT", "LOC", "COMM_NUM", "COMET_1F1R", "COMET_3F4F", "COMET_4F3R", "COMET_CONS", "FLANK", "FLANK2", "FLANK3","BAD_E_RUN"))
drs12 <- subset(dr, GENE%in%c('START-2F','1R-3F'), select=c("ASS", "UNASS", "GENE", "TAXA", "AMPLICON", "EXTRACT_ID", "PANGEA_ID", "BATCH", "BATCH2", "ART", "BATCHVL", "VL", "COHORT", "LOC", "COMM_NUM", "COMET_1F1R", "COMET_3F4F", "COMET_4F3R", "COMET_CONS", "FLANK", "FLANK2", "FLANK3"))
drs23 <- subset(dr, GENE%in%c('1R-3F','2R-4F'), select=c("ASS", "UNASS", "GENE", "TAXA", "AMPLICON", "EXTRACT_ID", "PANGEA_ID", "BATCH", "BATCH2", "ART", "BATCHVL", "VL", "COHORT", "LOC", "COMM_NUM", "COMET_1F1R", "COMET_3F4F", "COMET_4F3R", "COMET_CONS", "FLANK", "FLANK2", "FLANK3"))
drs2 <- subset(dr, GENE%in%c('1R-3F'), select=c("ASS", "UNASS", "GENE", "TAXA", "AMPLICON", "EXTRACT_ID", "PANGEA_ID", "BATCH", "BATCH2", "ART", "BATCHVL", "VL", "COHORT", "LOC", "COMM_NUM", "COMET_1F1R", "COMET_3F4F", "COMET_4F3R", "COMET_CONS", "FLANK", "FLANK2", "FLANK3"))
drs1 <- subset(dr, GENE%in%c('START-2F'), select=c("ASS", "UNASS", "GENE", "TAXA", "AMPLICON", "EXTRACT_ID", "PANGEA_ID", "BATCH", "BATCH2", "ART", "BATCHVL", "VL", "COHORT", "LOC", "COMM_NUM", "COMET_1F1R", "COMET_3F4F", "COMET_4F3R", "COMET_CONS", "FLANK", "FLANK2", "FLANK3"))
drsnpr <- subset(drs, !grepl('PRIOR',TAXA))
drstnpr <- subset(dr, !grepl('PRIOR',TAXA), select=c("ASS", "UNASS", "GENE", "TAXA", "AMPLICON", "EXTRACT_ID", "PANGEA_ID", "BATCH", "BATCH2", "ART", "BATCHVL", "VL", "COHORT", "LOC", "COMM_NUM", "COMET_1F1R", "COMET_3F4F", "COMET_4F3R", "COMET_CONS", "FLANK", "FLANK2", "FLANK3", "SAMPLEDATEc", 'SAMPLEDATE'))
#
# identify sig plates
#
if(0)
{
m1 <- glm(data=drs, UNASS ~ VL + COHORT + AMPLICON + ART + BATCHVL + BATCH , family='binomial')
m1.or <- cbind(data.table(COEF=names(coef(m1))), as.data.table( exp(cbind(OR = coef(m1), confint(m1))) ) )
setnames(m1.or, c('2.5 %','97.5 %'), c('l95','u95'))
m1a <- glm(data=drs12, UNASS ~ VL + COHORT + AMPLICON + ART + BATCHVL + BATCH , family='binomial')
m1a.or <- cbind(data.table(COEF=names(coef(m1a))), as.data.table( exp(cbind(OR = coef(m1a), confint(m1a))) ) )
setnames(m1a.or, c('2.5 %','97.5 %'), c('l95','u95'))
m1b <- glm(data=drs1, UNASS ~ VL + COHORT + ART + BATCHVL + BATCH , family='binomial')
m1b.or <- cbind(data.table(COEF=names(coef(m1b))), as.data.table( exp(cbind(OR = coef(m1b), confint(m1b))) ) )
setnames(m1b.or, c('2.5 %','97.5 %'), c('l95','u95'))
m1c <- glm(data=drs2, UNASS ~ VL + COHORT + ART + BATCHVL + BATCH , family='binomial')
m1c.or <- cbind(data.table(COEF=names(coef(m1c))), as.data.table( exp(cbind(OR = coef(m1c), confint(m1c))) ) )
setnames(m1c.or, c('2.5 %','97.5 %'), c('l95','u95'))
m1d <- glm(data=drs23, UNASS ~ VL + COHORT + ART + BATCHVL + BATCH , family='binomial')
m1d.or <- cbind(data.table(COEF=names(coef(m1d))), as.data.table( exp(cbind(OR = coef(m1d), confint(m1d))) ) )
setnames(m1d.or, c('2.5 %','97.5 %'), c('l95','u95'))
sig.or <- m1.or
sig.or <- m1a.or
sig.or <- m1b.or
sig.or <- m1c.or
sig.or <- m1d.or
sig.plates <- subset(sig.or, l95>1 & grepl('BATCH[0-9]+',COEF))[, as.integer(regmatches(COEF, regexpr('[0-9]+',COEF)))]
sig.plates <- c(sig.plates, 16033, 15934)
}
m5 <- glm(data=drs, UNASS ~ VL + COHORT + AMPLICON + ART + BAD_E_RUN , family='binomial')
# BAD_E_RUN significant
sig.plates <- subset(dr, BAD_E_RUN=='Y')[, sort(unique(BATCH2))]
# sig.plates characteristics
length(sig.plates)
subset(dm, as.integer(gsub('_.*','',SANGER_ID))%in%sig.plates )
subset(dm, as.integer(gsub('_.*','',SANGER_ID))%in%sig.plates )[, table(COHORT)]
# plot significant batches
tmp <- subset(drstnpr, GENE=='1R-3F')
if(0)
{
tmp2 <- subset(sig.or, l95>1 & OR<=3 & grepl('BATCH',COEF))[, as.integer(regmatches(COEF, regexpr('[0-9]+',COEF)))]
tmp2 <- tmp[, which(BATCH2%in%tmp2)]
set(tmp, tmp2, 'BATCH', tmp[tmp2,paste(BATCH,' *',sep='')])
tmp2 <- subset(sig.or, l95>1 & OR>3 & OR<=10 & grepl('BATCH',COEF))[, as.integer(regmatches(COEF, regexpr('[0-9]+',COEF)))]
tmp2 <- tmp[, which(BATCH2%in%tmp2)]
set(tmp, tmp2, 'BATCH', tmp[tmp2,paste(BATCH,' **',sep='')])
tmp2 <- subset(sig.or, l95>1 & OR>10 & grepl('BATCH',COEF))[, as.integer(regmatches(COEF, regexpr('[0-9]+',COEF)))]
tmp2 <- tmp[, which(BATCH2%in%tmp2)]
set(tmp, tmp2, 'BATCH', tmp[tmp2,paste(BATCH,' ***',sep='')])
tmp2 <- tmp[, which(COHORT=='RCCS')]
set(tmp, tmp2, 'COMM_NUM', tmp[tmp2,paste('Rakai-',COMM_NUM,sep='')])
}
tmp2 <- tmp[, which(BATCH2%in%sig.plates)]
set(tmp, tmp2, 'BATCH', tmp[tmp2,paste(BATCH,' ***',sep='')])
tmp2 <- tmp[, list(EXTRACT_MED=median(EXTRACT_ID)), by=c('BATCH')]
setkey(tmp2, EXTRACT_MED)
set(tmp2, NULL, 'BATCH3', tmp2[, factor(EXTRACT_MED, levels=EXTRACT_MED, labels=BATCH)])
tmp <- tmp[, list(N=length(TAXA), P=mean(UNASS==1), SD= ifelse(all(is.na(SAMPLEDATE)), -1L, as.integer(mean(SAMPLEDATE, na.rm=1)<2012.25))), by=c('BATCH','BATCH2','COMM_NUM')]
tmp <- merge(tmp, tmp2, by='BATCH')
tmp <- subset(tmp, !COMM_NUM%in%c('Rakai-RCCS','MRC'))
ggplot( tmp, aes(x=BATCH3, y=COMM_NUM, size=N, colour=P)) +
geom_point() +
scale_colour_gradient(low='blue', high='orange') +
theme_bw() + theme(axis.text.x=element_text(angle=90, vjust=1)) +
labs(x='sequencing plate (ordered by sample extraction at UCLH)', y='sampling location', colour='proportion of\nsamples with\n<20% assembled sites\nin 1R-3F', size='number of\nsamples', pch='Average\nsample date\nbefore 2012.25')
ggsave(file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_batch_vs_community.pdf'), w=25, h=10, useDingbats=FALSE)
#
# exclude sig plates on data without priors
#
m2 <- glm(data=subset(drsnpr,!BATCH2%in%sig.plates ), UNASS ~ VL + COHORT + AMPLICON + ART, family='binomial')
#m2 <- glm(data=subset(drsnpr,!BATCH2%in%sig.plates & VL!='No VL measured'), UNASS ~ VL + COHORT + AMPLICON + ART, family='binomial')
summary(m2)
m2.or <- cbind(data.table(COEF=names(coef(m2))), as.data.table( exp(cbind(OR = coef(m2), confint(m2))) ) )
setnames(m2.or, c('2.5 %','97.5 %'), c('l95','u95'))
#
# subtype on data without priors and without sig plates
#
drsnprs <- subset(drsnpr, COMET_CONS!='short' & !BATCH2%in%sig.plates)
m2.5.CONS <- glm(data=drsnprs, UNASS ~ VL + COHORT + ART + AMPLICON + COMET_CONS, family='binomial')
summary(m2.5.CONS)
tmp <- m2.5.CONS
m2.5.CONS.or<- cbind(data.table(COEF=names(coef(tmp))), as.data.table( exp(cbind(OR = coef(tmp), confint(tmp))) ) )
setnames(m2.5.CONS.or, c('2.5 %','97.5 %'), c('l95','u95'))
#
# write odds ratio tables
#
tmp <- copy(m2.or)
set(tmp, NULL, 'OR', tmp[, round(OR,d=2)])
set(tmp, NULL, 'CI', tmp[, paste(round(l95,d=2),'-',round(u95,d=2),sep='')])
write.csv(subset(tmp, !is.na(OR), select=c(COEF, OR, CI)), row.names=FALSE, file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_regression_model12.1e.csv'))
tmp <- copy(m2.5.CONS.or)
set(tmp, NULL, 'OR', tmp[, round(OR,d=2)])
set(tmp, NULL, 'CI', tmp[, paste(round(l95,d=2),'-',round(u95,d=2),sep='')])
write.csv(subset(tmp, !is.na(OR), select=c(COEF, OR, CI)), row.names=FALSE, file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_regression_model2.5.CONS.csv'))
}
##--------------------------------------------------------------------------------------------------------
## olli 25.07.16
##--------------------------------------------------------------------------------------------------------
treecomparison.explaingaps.regressions.170110<- function()
{
require(ape)
require(data.table)
wdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/explaingaps'
wfile <- 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment.rda'
load(file.path(wdir, wfile))
#
# merge MRC Uganda
#
set(dm, NULL, 'COMM_NUM', dm[, as.character(COMM_NUM)])
tmp <- dm[, which(grepl('MRC',COHORT))]
set(dm, tmp, 'COMM_NUM', dm[tmp, COHORT])
set(dm, tmp, 'COHORT', 'UG-MRC')
# calculate total mutational distance in all primers
dr <- dpand[, list(NT_DIFF_SUM=sum(NT_DIFF)), by=c('TAXA','PR')]
#dr[, length(TAXA), by='PR']
set(dr, NULL, 'PR', dr[, paste('NT_DIFF_',PR,sep='')])
# merge with gene regions
tmp <- subset(dgd, grepl('half',GENE) | (!grepl('GAG|POL|ENV',GENE) & !grepl('half',GENE)))
set(tmp, NULL, c('START','END','ACTG'), NULL)
dr <- merge(dr, tmp, by='TAXA',allow.cartesian=TRUE)
set(dr, dr[, which(is.na(NT_DIFF_SUM))], 'NT_DIFF_SUM', -1)
set(dr, dr[, which(NT_DIFF_SUM>0)],'NT_DIFF_SUM', 1)
set(dr, NULL, 'NT_DIFF_SUM', dr[, as.character(factor(NT_DIFF_SUM, levels=c(-1,0,1), labels=c('not_assembled','no mutation','at_least_one_mutation')))])
# select only meaningful comparisons between primer and gene region
dr <- dcast.data.table(dr, TAXA+GENE+UNASS~PR, value.var='NT_DIFF_SUM')
dr <- subset(dr, GENE%in%c('2F-1R','1R-3F','2R-4F','4F-3R'))
tmp <- subset(dr, GENE=='2F-1R', c(TAXA, GENE, UNASS, NT_DIFF_2F, NT_DIFF_1R))
setnames(tmp, colnames(tmp)[4:5], c('FLANK_L','FLANK_R'))
tmp2 <- subset(dr, GENE=='1R-3F', c(TAXA, GENE, UNASS, NT_DIFF_2F, NT_DIFF_2R))
setnames(tmp2, colnames(tmp2)[4:5], c('FLANK_L','FLANK_R'))
tmp <- rbind(tmp, tmp2)
tmp2 <- subset(dr, GENE=='2R-4F', c(TAXA, GENE, UNASS, NT_DIFF_3F, NT_DIFF_3R))
setnames(tmp2, colnames(tmp2)[4:5], c('FLANK_L','FLANK_R'))
tmp <- rbind(tmp, tmp2)
tmp2 <- subset(dr, GENE=='4F-3R', c(TAXA, GENE, UNASS, NT_DIFF_4F, NT_DIFF_3R))
setnames(tmp2, colnames(tmp2)[4:5], c('FLANK_L','FLANK_R'))
dr <- rbind(tmp, tmp2)
dr[, FLANK:= 'no_mutation']
set(dr, dr[, which(FLANK_L=='not_assembled'|FLANK_R=='not_assembled')], 'FLANK', 'at_least_one_not_assembled')
set(dr, dr[, which(FLANK_L=='at_least_one_mutation'|FLANK_R=='at_least_one_mutation')], 'FLANK', 'at_least_one_mutation')
#
# prepare regression factors
#
tmp <- subset(dm, select=c(PANGEA_ID, TAXA, STUDY_ID, EXTRACT_ID, SANGER_ID, SAMPLEDATE, ARTSTART, selfReportArt, everSelfReportArt, FirstSelfReportArt, CURRENTLYONART, RECENTVL, RECENTVL_U, RECENTVL_L, RECENTVLDATE, COMET_1F1R, COMET_3F4F, COMET_4F3R, COMET_CONS, COMET_1F1R_N, COMET_3F4F_N, COMET_4F3R_N, COMET_CONS_N, COHORT, LOC, COMM_NUM))
set(tmp, NULL, 'RECENTVL', tmp[, as.numeric(RECENTVL)])
dr <- merge(dr, tmp, by='TAXA')
# batches
dr[, BATCH:=NA_character_]
tmp <- dr[, which(!is.na(SANGER_ID))]
set(dr, tmp, 'BATCH', dr[tmp, regmatches(SANGER_ID,regexpr('^[0-9]+', SANGER_ID))])
dr[, BATCH2:= as.integer(BATCH)]
# add locations to BATCH id
tmp <- dr[, list(BATCH_LABEL=paste(BATCH, ' (', paste(unique(COHORT),collapse='+'), ')',sep='')), by='BATCH']
dr <- merge(dr, tmp, by='BATCH')
dr[, BATCH:=NULL]
setnames(dr, 'BATCH_LABEL', 'BATCH')
# to every batch add one 0 and one 1
if(1)
{
tmp <- dr[, list(TAXA=c('PRIOR1','PRIOR2'), PANGEA_ID=c('PRIOR1','PRIOR2'), STUDY_ID=c('PRIOR1','PRIOR2'), SANGER_ID=c('PRIOR1','PRIOR2'), UNASS=c(0,1), FLANK='no_mutation', COHORT=COHORT[1] ), by=c('BATCH','GENE')]
#tmp <- dr[, list(TAXA=c('PRIOR1','PRIOR2','PRIOR3','PRIOR4','PRIOR5','PRIOR6'), PANGEA_ID=c('PRIOR1','PRIOR2','PRIOR3','PRIOR4','PRIOR5','PRIOR6'), STUDY_ID=c('PRIOR1','PRIOR2','PRIOR3','PRIOR4','PRIOR5','PRIOR6'), SANGER_ID=c('PRIOR1','PRIOR2','PRIOR3','PRIOR4','PRIOR5','PRIOR6'), UNASS=c(0,0,0,1,1,1), COHORT=COHORT[1] ), by=c('BATCH','PR')]
dr <- rbind(dr, tmp, use.names=TRUE, fill=TRUE)
}
# average viral load per batch
tmp <- dr[, {
ans <- NA_character_
tmp <- which((COHORT=='BW-Mochudi' & !is.na(RECENTVL)) | abs(RECENTVLDATE-SAMPLEDATE)<1)
if(length(tmp)<length(COHORT)*.5)
ans <- 'No VL measured'
tmp <- as.character(cut(median(RECENTVL[tmp]), breaks=c(0, 1e4, 2e4, 4e4, 1e5, Inf), labels=c('<1e4','1e4-2e4','2e4-4e4','4e4-1e5','>1e5')))
#tmp <- as.character(cut(median(RECENTVL[tmp]), breaks=c(0, 1e4, 5e4, 1e5, Inf), labels=c('<1e4','1e4-5e4','5e4-1e5','>1e5')))
if(!is.na(tmp) & is.na(ans))
ans <- tmp
list(BATCHVL= ans)
}, by=c('BATCH','GENE')]
dr <- merge(dr, tmp, by=c('BATCH','GENE'))
# ART status
dr[, ART:= as.integer(ARTSTART<SAMPLEDATE)]
set(dr, dr[, which(is.na(ART))], 'ART', 0L)
set(dr, dr[, which(ART==0 & everSelfReportArt==1 & SAMPLEDATE<FirstSelfReportArt)], 'ART', 1L)
set(dr, dr[, which(ART==0 & COHORT=='BW-Mochudi' & CURRENTLYONART=='Y')], 'ART', 1L)
set(dr, dr[, which(ART==0 & COHORT=='UG-MRC' & CURRENTLYONART=='Y')], 'ART', 1L)
set(dr, dr[, which(ART==0 & COHORT=='AC_Resistance')], 'ART', 0L)
set(dr, NULL, 'ART', dr[, factor(ART, levels=c(0L,1L), labels=c('no ART', 'ART started or self reported'))])
# use categorical values from UCL
if(1)
{
set(dr, dr[, which(is.na(RECENTVL) & RECENTVL_U<=1e4)], 'RECENTVL', 1e4-1)
set(dr, dr[, which(is.na(RECENTVL) & RECENTVL_U<=5e4)], 'RECENTVL', 5e4-1)
set(dr, dr[, which(is.na(RECENTVL) & RECENTVL_U<=1e5)], 'RECENTVL', 1e5-1)
set(dr, dr[, which(is.na(RECENTVL) & RECENTVL_U>1e5)], 'RECENTVL', 1e5+1)
stopifnot( !nrow(subset(dr, !is.na(RECENTVL_U) & is.na(RECENTVL))) )
}
# recent viral load per individual
dr[, VL:='No VL measured']
tmp <- dr[, which((COHORT=='BW-Mochudi' & !is.na(RECENTVL)) | (!is.na(RECENTVL) & abs(RECENTVLDATE-SAMPLEDATE)<1))]
#set(dr, tmp, 'VL', dr[tmp, cut(RECENTVL, breaks=c(-1, 1e4, 2e4, 4e4, 1e5, Inf), labels=c('<1e4','1e4-2e4','2e4-4e4','4e4-1e5','>1e5'))])
set(dr, tmp, 'VL', dr[tmp, cut(RECENTVL, breaks=c(-1, 1e4, 5e4, 1e5, Inf), labels=c('<1e4','1e4-5e4','5e4-1e5','>1e5'))])
# primers
dr[, FLANK2:= FLANK]
tmp <- dr[, which(FLANK2=='at_least_one_mutation')]
set(dr, tmp, 'FLANK2', dr[tmp, paste0(GENE,'_',FLANK2)])
dr[, FLANK3:= FLANK]
set(dr, dr[, which(grepl('mutation',FLANK3))], 'FLANK3', 'assembled')
# region, community number, household number
tmp <- dr[, which(!grepl('PRIOR',TAXA) & is.na(LOC))]
set(dr, tmp, 'LOC', dr[tmp, COHORT])
set(dr, NULL, 'COMM_NUM', dr[, as.character(COMM_NUM)])
tmp <- dr[, which(!grepl('PRIOR',TAXA) & is.na(COMM_NUM))]
set(dr, tmp, 'COMM_NUM', dr[tmp, COHORT])
# extraction IDs
set(dr, NULL, 'EXTRACT_ID', dr[,as.integer(gsub('^0+','',EXTRACT_ID))])
# sample date
dr[, SAMPLEDATEc:= cut(SAMPLEDATE, breaks=seq(2009, 2015, 0.25), labels=seq(2009, 2015-0.25, 0.25))]
# amplicon
dr[, AMPLICON:= dr[,factor(GENE, levels=c('2F-1R','1R-3F','2R-4F','4F-3R'), labels=c('1','2','3','4'))]]
#
# exclude AfricaCentre because pre-selected after amplification
#
dr <- subset(dr, COHORT!='AC_Resistance')
#
# re-level so that 'presumably good' factors are the reference
#
set(dr, NULL, 'AMPLICON', dr[, relevel(factor(AMPLICON), ref='1')])
set(dr, NULL, 'FLANK3', dr[, relevel(factor(FLANK3), ref='assembled')])
set(dr, NULL, 'FLANK2', dr[, relevel(factor(FLANK2), ref='no_mutation')])
set(dr, NULL, 'FLANK', dr[, relevel(factor(FLANK), ref='no_mutation')])
# defining the reference level for batches is tricky:
# if it s a very good batch, then this could simply be down to high VL
# if it s an RCCS batch with average viral load, then it could still be down to community?
# OK how about we take a batch that performs as well as a typical UG-MRC run, eg 66% on pol?
# --> this is '14683'
set(dr, NULL, 'BATCH', dr[, relevel(factor(BATCH), ref='14683 (BW-Mochudi)')])
set(dr, NULL, 'COHORT', dr[, relevel(factor(COHORT), ref='BW-Mochudi')])
#set(dr, NULL, 'COHORT', dr[, relevel(factor(COHORT), ref='AC_Resistance')])
set(dr, NULL, 'VL', dr[, relevel(factor(VL), ref='>1e5')])
set(dr, NULL, 'BATCHVL', dr[, relevel(factor(BATCHVL), ref='4e4-1e5')])
#set(dr, NULL, 'BATCHVL', dr[, relevel(factor(BATCHVL), ref='5e4-1e5')])
set(dr, NULL, 'ART', dr[, relevel(ART, ref='no ART')])
set(dr, NULL, 'COMET_1F1R', dr[, relevel(factor(COMET_1F1R), ref='A1')])
set(dr, NULL, 'COMET_3F4F', dr[, relevel(factor(COMET_3F4F), ref='A1')])
set(dr, NULL, 'COMET_4F3R', dr[, relevel(factor(COMET_4F3R), ref='A1')])
set(dr, NULL, 'COMET_CONS', dr[, relevel(factor(COMET_CONS), ref='A1')])
set(dr, NULL, 'LOC', dr[, relevel(factor(LOC), ref='13')])
#set(dr, NULL, 'COMM_NUM', dr[, relevel(factor(COMM_NUM), ref='106')])
set(dr, NULL, 'COMM_NUM', dr[, relevel(factor(COMM_NUM), ref='BW-Mochudi')])
set(dr, NULL, 'SAMPLEDATEc', dr[, relevel(factor(SAMPLEDATEc), ref='2010.25')])
#
# select data
#
ggplot(dr, aes(x=UNASS)) + geom_histogram() + facet_wrap(~GENE)
# use <60% vs >80%
dr[, ASS:= as.numeric(as.character(cut(UNASS, breaks=c(-1, 0.6, 0.8, 2), labels=c('1','0.5','0'))))]
#dr[, ASS:= as.numeric(as.character(cut(UNASS, breaks=c(-1, 0.4, 0.6, 2), labels=c('1','0.5','0'))))]
dr <- subset(dr, ASS!=.5)
dr[, UNASS:= 1-ASS]
#
# prepare data sets for logistic regression
#
drs <- subset(dr, select=c("ASS", "UNASS", "GENE", "TAXA", "AMPLICON", "EXTRACT_ID", "PANGEA_ID", "BATCH", "BATCH2", "ART", "BATCHVL", "VL", "COHORT", "LOC", "COMM_NUM", "COMET_1F1R", "COMET_3F4F", "COMET_4F3R", "COMET_CONS", "FLANK", "FLANK2", "FLANK3"))
drs12 <- subset(dr, GENE%in%c('1R-3F','2F-1R'), select=c("ASS", "UNASS", "GENE", "TAXA", "AMPLICON", "EXTRACT_ID", "PANGEA_ID", "BATCH", "BATCH2", "ART", "BATCHVL", "VL", "COHORT", "LOC", "COMM_NUM", "COMET_1F1R", "COMET_3F4F", "COMET_4F3R", "COMET_CONS", "FLANK", "FLANK2", "FLANK3"))
drsnpr <- subset(drs, !grepl('PRIOR',TAXA))
drstnpr <- subset(dr, !grepl('PRIOR',TAXA), select=c("ASS", "UNASS", "GENE", "TAXA", "AMPLICON", "EXTRACT_ID", "PANGEA_ID", "BATCH", "BATCH2", "ART", "BATCHVL", "VL", "COHORT", "LOC", "COMM_NUM", "COMET_1F1R", "COMET_3F4F", "COMET_4F3R", "COMET_CONS", "FLANK", "FLANK2", "FLANK3", "SAMPLEDATEc", 'SAMPLEDATE'))
#
# identify sig plates
#
m1 <- glm(data=drs, UNASS ~ VL + COHORT + AMPLICON + FLANK + ART + BATCHVL + BATCH , family='binomial')
m1.or <- cbind(data.table(COEF=names(coef(m1))), as.data.table( exp(cbind(OR = coef(m1), confint(m1))) ) )
setnames(m1.or, c('2.5 %','97.5 %'), c('l95','u95'))
m1a <- glm(data=drs12, UNASS ~ VL + COHORT + FLANK + ART + BATCHVL + BATCH , family='binomial')
m1a.or <- cbind(data.table(COEF=names(coef(m1a))), as.data.table( exp(cbind(OR = coef(m1a), confint(m1a))) ) )
setnames(m1a.or, c('2.5 %','97.5 %'), c('l95','u95'))
sig.plates <- subset(m1a.or, l95>1 & grepl('BATCH[0-9]+',COEF))[, as.integer(regmatches(COEF, regexpr('[0-9]+',COEF)))]
sig.plates <- c(sig.plates, 16033, 15934)
# sig.plates characteristics
length(sig.plates)
subset(dm, as.integer(gsub('_.*','',SANGER_ID))%in%sig.plates )
subset(dm, as.integer(gsub('_.*','',SANGER_ID))%in%sig.plates )[, table(COHORT)]
# plot significant batches
tmp <- subset(drstnpr, GENE=='1R-3F')
tmp2 <- subset(m1a.or, l95>1 & OR<=3 & grepl('BATCH',COEF))[, as.integer(regmatches(COEF, regexpr('[0-9]+',COEF)))]
tmp2 <- tmp[, which(BATCH2%in%tmp2)]
set(tmp, tmp2, 'BATCH', tmp[tmp2,paste(BATCH,' *',sep='')])
tmp2 <- subset(m1a.or, l95>1 & OR>3 & OR<=10 & grepl('BATCH',COEF))[, as.integer(regmatches(COEF, regexpr('[0-9]+',COEF)))]
tmp2 <- tmp[, which(BATCH2%in%tmp2)]
set(tmp, tmp2, 'BATCH', tmp[tmp2,paste(BATCH,' **',sep='')])
tmp2 <- subset(m1a.or, l95>1 & OR>10 & grepl('BATCH',COEF))[, as.integer(regmatches(COEF, regexpr('[0-9]+',COEF)))]
tmp2 <- tmp[, which(BATCH2%in%tmp2)]
set(tmp, tmp2, 'BATCH', tmp[tmp2,paste(BATCH,' ***',sep='')])
tmp2 <- tmp[, which(COHORT=='RCCS')]
set(tmp, tmp2, 'COMM_NUM', tmp[tmp2,paste('Rakai-',COMM_NUM,sep='')])
tmp2 <- tmp[, list(EXTRACT_MED=median(EXTRACT_ID)), by=c('BATCH')]
setkey(tmp2, EXTRACT_MED)
set(tmp2, NULL, 'BATCH3', tmp2[, factor(EXTRACT_MED, levels=EXTRACT_MED, labels=BATCH)])
tmp <- tmp[, list(N=length(TAXA), P=mean(UNASS==1), SD= ifelse(all(is.na(SAMPLEDATE)), -1L, as.integer(mean(SAMPLEDATE, na.rm=1)<2012.25))), by=c('BATCH','BATCH2','COMM_NUM')]
tmp <- merge(tmp, tmp2, by='BATCH')
tmp <- subset(tmp, !COMM_NUM%in%c('Rakai-RCCS','MRC'))
ggplot( tmp, aes(x=BATCH3, y=COMM_NUM, size=N, colour=P)) +
geom_point() +
scale_colour_gradient(low='blue', high='orange') +
theme_bw() + theme(axis.text.x=element_text(angle=90, vjust=1)) +
labs(x='sequencing plate (ordered by sample extraction at UCLH)', y='sampling location', colour='proportion of\nsamples with\n<20% assembled sites\nin 1R-3F', size='number of\nsamples', pch='Average\nsample date\nbefore 2012.25')
ggsave(file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_batch_vs_community.pdf'), w=25, h=10, useDingbats=FALSE)
#
# exclude sig plates on data without priors
#
m2 <- glm(data=subset(drsnpr,!BATCH2%in%sig.plates), UNASS ~ VL + COHORT + AMPLICON + FLANK + ART, family='binomial')
summary(m2)
m2.or <- cbind(data.table(COEF=names(coef(m2))), as.data.table( exp(cbind(OR = coef(m2), confint(m2))) ) )
setnames(m2.or, c('2.5 %','97.5 %'), c('l95','u95'))
#
# subtype on data without priors and without sig plates
#
drsnprs <- subset(drsnpr, COHORT%in%c('RCCS','UG-MRC','BW-Mochudi') & COMET_CONS!='short' & !BATCH2%in%sig.plates)
m2.5.CONS <- glm(data=drsnprs, UNASS ~ VL + COHORT + ART + AMPLICON + FLANK + COMET_CONS, family='binomial')
summary(m2.5.CONS)
tmp <- m2.5.CONS
m2.5.CONS.or<- cbind(data.table(COEF=names(coef(tmp))), as.data.table( exp(cbind(OR = coef(tmp), confint(tmp))) ) )
setnames(m2.5.CONS.or, c('2.5 %','97.5 %'), c('l95','u95'))
#
# write odds ratio tables
#
tmp <- copy(m2.or)
set(tmp, NULL, 'OR', tmp[, round(OR,d=2)])
set(tmp, NULL, 'CI', tmp[, paste(round(l95,d=2),'-',round(u95,d=2),sep='')])
write.csv(subset(tmp, !is.na(OR), select=c(COEF, OR, CI)), row.names=FALSE, file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_regression_model12.1e.csv'))
tmp <- copy(m2.5.CONS.or)
set(tmp, NULL, 'OR', tmp[, round(OR,d=2)])
set(tmp, NULL, 'CI', tmp[, paste(round(l95,d=2),'-',round(u95,d=2),sep='')])
write.csv(subset(tmp, !is.na(OR), select=c(COEF, OR, CI)), row.names=FALSE, file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_regression_model2.5.CONS.csv'))
#
# run batches - subanalysis
#
drs2b <- copy(drs2a)
tmp2 <- unique(subset(drs2b, BATCH2>=15892 & BATCH2<=15964 & !is.na(COMM_NUM), COMM_NUM), by='COMM_NUM')
tmp2 <- unique(subset(merge(drs2b, tmp2, by='COMM_NUM'), select=BATCH2), by='BATCH2')
drs2b <- merge(drs2b, tmp2, by='BATCH2')
drs2b[, UNASSB:= 0L]
set(drs2b, drs2b[, which(BATCH2>=15892 & BATCH2<=15964 & !is.na(COMM_NUM))], 'UNASSB', 1L)
drs2b <- subset(drs2b, !grepl('PRIOR',TAXA), select=c("UNASSB", "TAXA", "PANGEA_ID", "BATCH", "PR_NTDIFFc", "ART", "BATCHVL", "VL", "COHORT", "LOC", "COMM_NUM", "ST", "SAMPLEDATE", "SAMPLEDATEc"))
set(drs2b, NULL, 'SAMPLEDATEc', drs2b[, relevel(factor(SAMPLEDATEc), ref='2011.75')])
write.csv(drs2b, row.names=FALSE, file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_batch_vs_community.csv'))
s2.1 <- glm(data=subset(drs2b, !is.na(SAMPLEDATE)), UNASSB ~ VL + COMM_NUM + PR_NTDIFFc + ART + ST + SAMPLEDATE, family='binomial')
summary(s2.1)
s2.2 <- glm(data=subset(drs2b, !is.na(SAMPLEDATE)), UNASSB ~ VL + PR_NTDIFFc + ART + ST + SAMPLEDATEc, family='binomial')
summary(s2.2)
#
# do the predictions, use model m2.1
#
# exclude singular batches to get OK prediction
#
# data for predictions
tmp <- subset(m12.1.or, l95>1.05 & grepl('BATCH',COEF))[, as.integer(regmatches(COEF, regexpr('[0-9]+',COEF)))]
dpr <- subset(drs2npr, COHORT%in%c('RCCS','UG-MRC','BW-Mochudi') & COMET_CONS!='short' & !BATCH2%in%tmp)
# higher viral load
tmp <- subset(dpr, VL!='No VL measured')
pr0 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
set(tmp, tmp[, which(VL%in%c('<1e4'))], 'VL', '1e4-2e4')
pr1 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
set(tmp, tmp[, which(VL%in%c('<1e4', '1e4-2e4'))], 'VL', '2e4-4e4')
pr2 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
set(tmp, tmp[, which(VL%in%c('<1e4', '1e4-2e4', '2e4-4e4'))], 'VL', '4e4-1e5')
pr3 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
set(tmp, tmp[, which(VL%in%c('<1e4', '1e4-2e4', '2e4-4e4','4e4-1e5'))], 'VL', '>1e5')
pr4 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
tmp[, H0u_p:= UNASS]
tmp[, H0all_p:= 1]
tmp[, H0_p:= pr0$fit]
tmp[, H0_l:= pr0$fit-1.96*pr0$se.fit]
tmp[, H0_u:= pr0$fit+1.96*pr0$se.fit]
tmp[, H1_p:= pr1$fit]
tmp[, H1_l:= pr1$fit-1.96*pr1$se.fit]
tmp[, H1_u:= pr1$fit+1.96*pr1$se.fit]
tmp[, H2_p:= pr2$fit]
tmp[, H2_l:= pr2$fit-1.96*pr2$se.fit]
tmp[, H2_u:= pr2$fit+1.96*pr2$se.fit]
tmp[, H3_p:= pr3$fit]
tmp[, H3_l:= pr3$fit-1.96*pr3$se.fit]
tmp[, H3_u:= pr3$fit+1.96*pr3$se.fit]
tmp[, H4_p:= pr4$fit]
tmp[, H4_l:= pr4$fit-1.96*pr4$se.fit]
tmp[, H4_u:= pr4$fit+1.96*pr4$se.fit]
tmp <- melt(tmp, id.vars=c('COHORT'), measure.vars=colnames(tmp)[grepl('^H[0-9]+',colnames(tmp))])
set(tmp, tmp[, which(value<0)], 'value', 0)
set(tmp, tmp[, which(value>1)], 'value', 1)
set(tmp, NULL, 'TYPE', tmp[, gsub('_','',regmatches(variable, regexpr('_[a-z]', variable)))])
set(tmp, NULL, 'variable', tmp[, regmatches(variable, regexpr('[A-Za-z0-9]+', variable))])
tmp <- tmp[, list(value=sum(value)), by=c('COHORT','TYPE','variable')]
tmp2 <- subset(tmp, variable=='H0' & TYPE=='p')
setnames(tmp2, 'value', 'BL')
tmp <- merge(tmp, subset(tmp2, select=c(COHORT, BL)), by='COHORT')
tmp2 <- subset(tmp, variable=='H0all')
setnames(tmp2, 'value', 'TOTAL')
tmp <- merge(tmp, subset(tmp2, select=c(COHORT, TOTAL)), by='COHORT')
tmp <- subset(tmp, variable!='H0u' & variable!='H0all' & TYPE=='p')
tmp[, REDUCTION_N:= BL-value]
tmp[, REDUCTION_P:= (BL-value)/TOTAL]
tmp <- subset(merge(tmp, data.table(variable=c('H1','H2','H3','H4'), LABEL=c('VL >1e4','VL >2e4','VL >4e4','VL >1e5')), by='variable'), select=c(COHORT, LABEL, REDUCTION_N, REDUCTION_P, TOTAL))
ans <- copy(tmp)
#
# primer sites assembled
tmp <- copy(dpr)
pr0 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
set(tmp, tmp[, which(PR_NTDIFFc%in%c('2F or 2R unassembled'))], 'PR_NTDIFFc', '0')
pr1 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
tmp[, H0u_p:= UNASS]
tmp[, H0all_p:= 1]
tmp[, H0_p:= pr0$fit]
tmp[, H0_l:= pr0$fit-1.96*pr0$se.fit]
tmp[, H0_u:= pr0$fit+1.96*pr0$se.fit]
tmp[, H1_p:= pr1$fit]
tmp[, H1_l:= pr1$fit-1.96*pr1$se.fit]
tmp[, H1_u:= pr1$fit+1.96*pr1$se.fit]
tmp <- melt(tmp, id.vars=c('COHORT'), measure.vars=colnames(tmp)[grepl('^H[0-9]+',colnames(tmp))])
set(tmp, tmp[, which(value<0)], 'value', 0)
set(tmp, tmp[, which(value>1)], 'value', 1)
set(tmp, NULL, 'TYPE', tmp[, gsub('_','',regmatches(variable, regexpr('_[a-z]', variable)))])
set(tmp, NULL, 'variable', tmp[, regmatches(variable, regexpr('[A-Za-z0-9]+', variable))])
tmp <- tmp[, list(value=sum(value)), by=c('COHORT','TYPE','variable')]
tmp2 <- subset(tmp, variable=='H0' & TYPE=='p')
setnames(tmp2, 'value', 'BL')
tmp <- merge(tmp, subset(tmp2, select=c(COHORT, BL)), by='COHORT')
tmp2 <- subset(tmp, variable=='H0all')
setnames(tmp2, 'value', 'TOTAL')
tmp <- merge(tmp, subset(tmp2, select=c(COHORT, TOTAL)), by='COHORT')
tmp <- subset(tmp, variable!='H0u' & variable!='H0all' & TYPE=='p')
tmp[, REDUCTION_N:= BL-value]
tmp[, REDUCTION_P:= (BL-value)/TOTAL]
tmp <- subset(merge(tmp, data.table(variable=c('H1'), LABEL=c('Assembled primer sites')), by='variable'), select=c(COHORT, LABEL, REDUCTION_N, REDUCTION_P, TOTAL))
ans <- rbind(ans, tmp)
#
# no mutations
tmp <- subset(dpr, PR_NTDIFFc!='2F or 2R unassembled')
pr0 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
set(tmp, tmp[, which(PR_NTDIFFc%in%c('2F at least one mutation'))], 'PR_NTDIFFc', '0')
pr1 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
set(tmp, tmp[, which(PR_NTDIFFc%in%c('2F at least one mutation','2R at least one mutation, 2F no mutation'))], 'PR_NTDIFFc', '0')
pr2 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
tmp[, H0u_p:= UNASS]
tmp[, H0all_p:= 1]
tmp[, H0_p:= pr0$fit]
tmp[, H0_l:= pr0$fit-1.96*pr0$se.fit]
tmp[, H0_u:= pr0$fit+1.96*pr0$se.fit]
tmp[, H1_p:= pr1$fit]
tmp[, H1_l:= pr1$fit-1.96*pr1$se.fit]
tmp[, H1_u:= pr1$fit+1.96*pr1$se.fit]
tmp[, H2_p:= pr2$fit]
tmp[, H2_l:= pr2$fit-1.96*pr2$se.fit]
tmp[, H2_u:= pr2$fit+1.96*pr2$se.fit]
tmp <- melt(tmp, id.vars=c('COHORT'), measure.vars=colnames(tmp)[grepl('^H[0-9]+',colnames(tmp))])
set(tmp, tmp[, which(value<0)], 'value', 0)
set(tmp, tmp[, which(value>1)], 'value', 1)
set(tmp, NULL, 'TYPE', tmp[, gsub('_','',regmatches(variable, regexpr('_[a-z]', variable)))])
set(tmp, NULL, 'variable', tmp[, regmatches(variable, regexpr('[A-Za-z0-9]+', variable))])
tmp <- tmp[, list(value=sum(value)), by=c('COHORT','TYPE','variable')]
tmp2 <- subset(tmp, variable=='H0' & TYPE=='p')
setnames(tmp2, 'value', 'BL')
tmp <- merge(tmp, subset(tmp2, select=c(COHORT, BL)), by='COHORT')
tmp2 <- subset(tmp, variable=='H0all')
setnames(tmp2, 'value', 'TOTAL')
tmp <- merge(tmp, subset(tmp2, select=c(COHORT, TOTAL)), by='COHORT')
tmp <- subset(tmp, variable!='H0u' & variable!='H0all' & TYPE=='p')
tmp[, REDUCTION_N:= BL-value]
tmp[, REDUCTION_P:= (BL-value)/TOTAL]
tmp <- subset(merge(tmp, data.table(variable=c('H1','H2'), LABEL=c('PR 2F no mutation','PR 2F, 2R no mutation')), by='variable'), select=c(COHORT, LABEL, REDUCTION_N, REDUCTION_P, TOTAL))
ans <- rbind(ans, tmp)
#
# subtype (from sub analysis)
tmp <- copy(dpr)
pr0 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
set(tmp, tmp[, which(COMET_CONS%in%c('D'))], 'COMET_CONS', 'A1')
pr1 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
set(tmp, tmp[, which(COMET_CONS%in%c('D','C'))], 'COMET_CONS', 'A1')
pr2 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
set(tmp, tmp[, which(COMET_CONS%in%c('D','C','pot_recombinant'))], 'COMET_CONS', 'A1')
pr3 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
set(tmp, tmp[, which(COMET_CONS%in%c('D','C','other','pot_recombinant'))], 'COMET_CONS', 'A1')
pr4 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
tmp[, H0u_p:= UNASS]
tmp[, H0all_p:= 1]
tmp[, H0_p:= pr0$fit]
tmp[, H0_l:= pr0$fit-1.96*pr0$se.fit]
tmp[, H0_u:= pr0$fit+1.96*pr0$se.fit]
tmp[, H1_p:= pr1$fit]
tmp[, H1_l:= pr1$fit-1.96*pr1$se.fit]
tmp[, H1_u:= pr1$fit+1.96*pr1$se.fit]
tmp[, H2_p:= pr2$fit]
tmp[, H2_l:= pr2$fit-1.96*pr2$se.fit]
tmp[, H2_u:= pr2$fit+1.96*pr2$se.fit]
tmp[, H3_p:= pr3$fit]
tmp[, H3_l:= pr3$fit-1.96*pr3$se.fit]
tmp[, H3_u:= pr3$fit+1.96*pr3$se.fit]
tmp[, H4_p:= pr4$fit]
tmp[, H4_l:= pr4$fit-1.96*pr4$se.fit]
tmp[, H4_u:= pr4$fit+1.96*pr4$se.fit]
tmp <- melt(tmp, id.vars=c('COHORT'), measure.vars=colnames(tmp)[grepl('^H[0-9]+',colnames(tmp))])
set(tmp, tmp[, which(value<0)], 'value', 0)
set(tmp, tmp[, which(value>1)], 'value', 1)
set(tmp, NULL, 'TYPE', tmp[, gsub('_','',regmatches(variable, regexpr('_[a-z]', variable)))])
set(tmp, NULL, 'variable', tmp[, regmatches(variable, regexpr('[A-Za-z0-9]+', variable))])
tmp <- tmp[, list(value=sum(value)), by=c('COHORT','TYPE','variable')]
tmp2 <- subset(tmp, variable=='H0' & TYPE=='p')
setnames(tmp2, 'value', 'BL')
tmp <- merge(tmp, subset(tmp2, select=c(COHORT, BL)), by='COHORT')
tmp2 <- subset(tmp, variable=='H0all')
setnames(tmp2, 'value', 'TOTAL')
tmp <- merge(tmp, subset(tmp2, select=c(COHORT, TOTAL)), by='COHORT')
tmp <- subset(tmp, variable!='H0u' & variable!='H0all' & TYPE=='p')
tmp[, REDUCTION_N:= BL-value]
tmp[, REDUCTION_P:= (BL-value)/TOTAL]
tmp <- subset(merge(tmp, data.table(variable=c('H1','H2','H3','H4'), LABEL=c('D','D,C','D,C,pot recombinant','D,C,other,pot recomb')), by='variable'), select=c(COHORT, LABEL, REDUCTION_N, REDUCTION_P, TOTAL))
ans <- rbind(ans, tmp)
#
# no ART self report
tmp <- copy(dpr)
pr0 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
set(tmp, tmp[, which(ART%in%c('ART started or self reported'))], 'ART', 'no ART')
pr1 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
tmp[, H0u_p:= UNASS]
tmp[, H0all_p:= 1]
tmp[, H0_p:= pr0$fit]
tmp[, H0_l:= pr0$fit-1.96*pr0$se.fit]
tmp[, H0_u:= pr0$fit+1.96*pr0$se.fit]
tmp[, H1_p:= pr1$fit]
tmp[, H1_l:= pr1$fit-1.96*pr1$se.fit]
tmp[, H1_u:= pr1$fit+1.96*pr1$se.fit]
tmp <- melt(tmp, id.vars=c('COHORT'), measure.vars=colnames(tmp)[grepl('^H[0-9]+',colnames(tmp))])
set(tmp, tmp[, which(value<0)], 'value', 0)
set(tmp, tmp[, which(value>1)], 'value', 1)
set(tmp, NULL, 'TYPE', tmp[, gsub('_','',regmatches(variable, regexpr('_[a-z]', variable)))])
set(tmp, NULL, 'variable', tmp[, regmatches(variable, regexpr('[A-Za-z0-9]+', variable))])
tmp <- tmp[, list(value=sum(value)), by=c('COHORT','TYPE','variable')]
tmp2 <- subset(tmp, variable=='H0' & TYPE=='p')
setnames(tmp2, 'value', 'BL')
tmp <- merge(tmp, subset(tmp2, select=c(COHORT, BL)), by='COHORT')
tmp2 <- subset(tmp, variable=='H0all')
setnames(tmp2, 'value', 'TOTAL')
tmp <- merge(tmp, subset(tmp2, select=c(COHORT, TOTAL)), by='COHORT')
tmp <- subset(tmp, variable!='H0u' & variable!='H0all' & TYPE=='p')
tmp[, REDUCTION_N:= BL-value]
tmp[, REDUCTION_P:= (BL-value)/TOTAL]
tmp <- subset(merge(tmp, data.table(variable=c('H1'), LABEL=c('ART not self-reported or started')), by='variable'), select=c(COHORT, LABEL, REDUCTION_N, REDUCTION_P, TOTAL))
ans <- rbind(ans, tmp)
#
# RCCS batches as typical UG-MRC 15034 (UG-MRC)
# bad batches as typical from same region 15699 (RCCS)
tmp <- subset(drs2npr, COHORT=='RCCS')
pr0 <- predict(m2.1, newdata=tmp, type='response', se.fit=TRUE)
tmp2 <- subset(m12.1.or, l95>1 & grepl('BATCH',COEF))[, gsub('BATCH','',COEF)]
set(tmp, tmp[, which(BATCH%in%tmp2)], 'BATCH', '15699 (RCCS)')
pr1 <- predict(m2.1, newdata=tmp, type='response', se.fit=TRUE)
tmp2 <- subset(subset(drs2a, !grepl('PRIOR',TAXA) & grepl('RCCS',BATCH))[, list(P_UNASS=round(sum(1-ASS)/length(ASS), d=2)), by='BATCH'], P_UNASS>0.38)[, BATCH]
set(tmp, tmp[, which(BATCH%in%tmp2)], 'BATCH', '15034 (UG-MRC)')
pr2 <- predict(m2.1, newdata=tmp, type='response', se.fit=TRUE)
tmp[, H0u_p:= UNASS]
tmp[, H0all_p:= 1]
tmp[, H0_p:= pr0$fit]
tmp[, H0_l:= pr0$fit-1.96*pr0$se.fit]
tmp[, H0_u:= pr0$fit+1.96*pr0$se.fit]
tmp[, H1_p:= pr1$fit]
tmp[, H1_l:= pr1$fit-1.96*pr1$se.fit]
tmp[, H1_u:= pr1$fit+1.96*pr1$se.fit]
tmp[, H2_p:= pr2$fit]
tmp[, H2_l:= pr2$fit-1.96*pr2$se.fit]
tmp[, H2_u:= pr2$fit+1.96*pr2$se.fit]
tmp <- melt(tmp, id.vars=c('COHORT'), measure.vars=colnames(tmp)[grepl('^H[0-9]+',colnames(tmp))])
set(tmp, tmp[, which(value<0)], 'value', 0)
set(tmp, tmp[, which(value>1)], 'value', 1)
set(tmp, NULL, 'TYPE', tmp[, gsub('_','',regmatches(variable, regexpr('_[a-z]', variable)))])
set(tmp, NULL, 'variable', tmp[, regmatches(variable, regexpr('[A-Za-z0-9]+', variable))])
tmp <- tmp[, list(value=sum(value)), by=c('COHORT','TYPE','variable')]
tmp2 <- subset(tmp, variable=='H0' & TYPE=='p')
setnames(tmp2, 'value', 'BL')
tmp <- merge(tmp, subset(tmp2, select=c(COHORT, BL)), by='COHORT')
tmp2 <- subset(tmp, variable=='H0all')
setnames(tmp2, 'value', 'TOTAL')
tmp <- merge(tmp, subset(tmp2, select=c(COHORT, TOTAL)), by='COHORT')
tmp <- subset(tmp, variable!='H0u' & variable!='H0all' & TYPE=='p')
tmp[, REDUCTION_N:= BL-value]
tmp[, REDUCTION_P:= (BL-value)/TOTAL]
tmp <- subset(merge(tmp, data.table(variable=c('H1','H2'), LABEL=c('no sig plates','as avg UG MRC')), by='variable'), select=c(COHORT, LABEL, REDUCTION_N, REDUCTION_P, TOTAL))
ans <- rbind(ans, tmp)
#
# write csv
set(ans, NULL, 'REDUCTION_P_L', ans[, paste(round(REDUCTION_P*100,d=1),'pc',sep='')])
#ans[, paste(unique(LABEL),collapse='", "')]
#set(ans, NULL, 'LABEL', ans[, factor(LABEL, levels=c("Assembled primer sites", "VL >1e4", "VL >2e4", "VL >4e4", "VL >1e5", "D", "Batch run as typical run from same comm", "PR 2F no mutation", "PR 2F, 2R no mutation", "ART not self-reported or started"))])
ans <- dcast.data.table(ans, LABEL~COHORT, value.var='REDUCTION_P_L')
write.csv(ans, row.names=FALSE, file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_percentreductions.csv'))
#
# save
#
save(drs2, m2.1, m2.2, m2.3, m2.1.or, file=file.path(wdir, gsub('.rda','_logistic_160905.rda',wfile)))
tmp <- copy(m2.1.or)
set(tmp, NULL, 'COEF', tmp[, gsub('COHORT','',gsub('PR_NTDIFFc','',gsub('VL','viral load ',gsub('BATCH','plate ',COEF))))])
set(tmp, NULL, 'OR', tmp[, round(OR,d=2)])
set(tmp, NULL, 'LABEL', tmp[, as.character(OR)])
set(tmp, NULL, 'l95', tmp[, round(l95,d=2)])
set(tmp, NULL, 'u95', tmp[, round(u95,d=2)])
set(tmp, NULL, 'LABEL2', tmp[, paste(l95,'-',u95,sep='')])
tmp2 <- tmp[, which(l95>1 & l95<=3)]
set(tmp, tmp2, 'LABEL', tmp[tmp2,paste(LABEL,' *',sep='')])
tmp2 <- tmp[, which(l95>3)]
set(tmp, tmp2, 'LABEL', tmp[tmp2,paste(LABEL,' **',sep='')])
write.csv(subset(tmp, !is.na(OR), select=c(COEF,LABEL,LABEL2)), row.names=FALSE, file=file.path(wdir, gsub('.rda','_logistic_160905.csv',wfile)))
#
# old stuff
#
#
# what are these batches?
#
set(drs2a, NULL, 'BATCH', drs2a[, as.integer(regmatches(BATCH,regexpr('[0-9]+',BATCH)))])
tmp2 <- subset(m2.1.or, grepl('BATCH',COEF) & grepl('(RCCS)',COEF,fixed=1) & l95>1)[, as.integer(regmatches(COEF,regexpr('[0-9]+',COEF)))]
db <- subset(drs2a, BATCH %in% tmp2)
db[, BATCH_UNASS:='Y']
tmp <- subset(drs2a, COHORT=='RCCS' & !BATCH%in%tmp)
tmp[, BATCH_UNASS:='N']
db <- rbind(db,tmp)
# compare first locations ( % of samples from... )
tmp <- db[, {
#z <- subset(db, BATCH_UNASS=='Y')[,table(as.character(COMM_NUM))]
z <- table(as.character(COMM_NUM))
ans <- as.data.table(binconf(z, sum(z)))
ans[, TYPE:=names(z)]
setnames(ans, c('PointEst','Lower','Upper'),c('central','l95','u95'))
ans <- melt(ans, id.vars='TYPE')
set(ans, NULL, 'value', ans[,round(value*100,d=1)])
ans
}, by='BATCH_UNASS']
dcast.data.table(tmp, TYPE~BATCH_UNASS+variable)
#
# batch models with BATCH VL
#
m.1 <- glm(data=drs, ASS ~ VL + COHORT + PR_NTDIFFc + ART + BATCHVL + BATCH, family='binomial')
summary(m.1)
# batch VL not significant, drop..
m.2 <- glm(data=drs, ASS ~ BATCH + VL + COHORT + PR_NTDIFFc + ART - 1, family='binomial')
summary(m.2)
# with batches, UG-MRC worse than RCCS + PR_NTDIFFc2F/2R mut significant, but also PR_NTDIFFc2R0 significant (simply knock on)
m.3 <- glm(data=drs, ASS ~ VL + COHORT + PR_NTDIFFc + ART, family='binomial')
summary(m.3)
# without batches, RCCS worse than UG-MRC: batches offer better explanation than study for RCCS
m.2.or<- cbind(data.table(COEF=names(coef(m.2))), as.data.table( exp(cbind(OR = coef(m.2), confint(m.2))) ) )
setnames(m.2.or, c('2.5 %','97.5 %'), c('l95','u95'))
subset(m.2.or, u95<0.95)
#
# batch models with BATCH VL
#
m2f.1 <- glm(data=drs2f, ASS ~ VL + COHORT + ART + NT_DIFFc + BATCHVL + BATCH, family='binomial')
summary(m2f.1)
m2f.1.or<- cbind(data.table(COEF=names(coef(m2f.1))), as.data.table( exp(cbind(OR = coef(m2f.1), confint(m2f.1))) ) )
setnames(m2f.1.or, c('2.5 %','97.5 %'), c('l95','u95'))
m2r.1 <- glm(data=drs2r, ASS ~ VL + COHORT + ART + NT_DIFF_2Rc + BATCHVL + BATCH, family='binomial')
summary(m2r.1)
m2r.1.or<- cbind(data.table(COEF=names(coef(m2r.1))), as.data.table( exp(cbind(OR = coef(m2r.1), confint(m2r.1))) ) )
setnames(m2r.1.or, c('2.5 %','97.5 %'), c('l95','u95'))
#
# batch models without BATCH VL
#
m2f.2 <- glm(data=drs2f, ASS ~ VL + COHORT + ART + NT_DIFF_2Fc + BATCH, family='binomial')
summary(m2f.2)
m2f.2.or<- cbind(data.table(COEF=names(coef(m2f.2))), as.data.table( exp(cbind(OR = coef(m2f.2), confint(m2f.2))) ) )
setnames(m2f.2.or, c('2.5 %','97.5 %'), c('l95','u95'))
m2r.2 <- glm(data=drs2r, ASS ~ VL + COHORT + ART + NT_DIFF_2Rc + BATCH, family='binomial')
summary(m2r.2)
m2r.2.or<- cbind(data.table(COEF=names(coef(m2r.2))), as.data.table( exp(cbind(OR = coef(m2r.2), confint(m2r.2))) ) )
setnames(m2r.2.or, c('2.5 %','97.5 %'), c('l95','u95'))
#
# no BATCHES
#
m2f.3 <- glm(data=drs2f, ASS ~ VL + COHORT + ART + NT_DIFF_2Fc, family='binomial')
summary(m2f.3)
m2f.3.or<- cbind(data.table(COEF=names(coef(m2f.3))), as.data.table( exp(cbind(OR = coef(m2f.3), confint(m2f.3))) ) )
setnames(m2f.3.or, c('2.5 %','97.5 %'), c('l95','u95'))
m2r.3 <- glm(data=drs2r, ASS ~ VL + COHORT + ART + NT_DIFF_2Rc, family='binomial')
summary(m2r.3)
m2r.3.or<- cbind(data.table(COEF=names(coef(m2r.3))), as.data.table( exp(cbind(OR = coef(m2r.3), confint(m2r.3))) ) )
setnames(m2r.3.or, c('2.5 %','97.5 %'), c('l95','u95'))
#
# no BATCHES and subtype
#
m2f.4 <- glm(data=drs2f, ASS ~ VL + COHORT + ART + ST + NT_DIFF_2Fc, family='binomial')
summary(m2f.4)
m2f.4.or<- cbind(data.table(COEF=names(coef(m2f.4))), as.data.table( exp(cbind(OR = coef(m2f.4), confint(m2f.4))) ) )
setnames(m2f.4.or, c('2.5 %','97.5 %'), c('l95','u95'))
m2r.4 <- glm(data=drs2r, ASS ~ VL + COHORT + ART + ST + NT_DIFF_2Rc, family='binomial')
summary(m2r.4)
m2r.4.or<- cbind(data.table(COEF=names(coef(m2r.4))), as.data.table( exp(cbind(OR = coef(m2r.4), confint(m2r.4))) ) )
setnames(m2r.4.or, c('2.5 %','97.5 %'), c('l95','u95'))
#
# NT_DIFF_2Fcat least one mutation in models 3,4 and stronger when batches included
# NT_DIFF_2Rcat least one mutation not sig
#
# NT_DIFF_2FcUnassembled and NT_DIFF_2RcUnassembled always sig
#
# STD and STB or C significant for 2F
# compare significant batches
tmp <- subset(m2f.1.or, u95<0.95)
tmp[, PR:='2F']
tmp[, MODEL:= 'adjusting for avg VL in batch']
tmp2 <- subset(m2r.1.or, u95<0.95)
tmp2[, PR:='2R']
tmp2[, MODEL:= 'adjusting for avg VL in batch']
tmp <- rbind(tmp, tmp2)
tmp2 <- subset(m2f.2.or, u95<0.95)
tmp2[, PR:='2F']
tmp2[, MODEL:= 'not adjusting for avg VL in batch']
tmp <- rbind(tmp, tmp2)
tmp2 <- subset(m2r.2.or, u95<0.95)
tmp2[, PR:='2R']
tmp2[, MODEL:= 'not adjusting for avg VL in batch']
tmp <- rbind(tmp, tmp2)
tmp <- subset(tmp, grepl('BATCH', COEF) & COEF!='BATCHNo matched ID')
set(tmp, NULL, 'BATCH', tmp[,gsub('BATCH','',COEF)])
tmp <- merge(tmp, unique(subset(dr, select=c(BATCH, COHORT)), by=c('BATCH','COHORT')), by='BATCH')
#
ggplot(subset(tmp, MODEL=='adjusting for avg VL in batch' & COHORT=='RCCS'), aes(x=BATCH, y=OR, ymin=l95, ymax=u95)) + geom_point() + geom_errorbar() + facet_grid(~PR) + coord_flip()
ggsave(file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_batch_oddsratio_BATCHVL.pdf'), w=7, h=10)
ggplot(subset(tmp, MODEL=='not adjusting for avg VL in batch' & COHORT=='RCCS'), aes(x=BATCH, y=OR, ymin=l95, ymax=u95)) + geom_point() + geom_errorbar() + facet_grid(~PR) + coord_flip()
ggsave(file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_batch_oddsratio_NOBATCHVL.pdf'), w=7, h=10)
#
subset(m2r.4.or, u95<0.95)
subset(m2r.1.or, u95<0.95)
#
# batch models without BATCH VL and ignoring AC resistance
#
m2f.2b <- glm(data=subset(drs2f, COHORT!='AC_Resistance'), ASS ~ VL + COHORT + ART + NT_DIFF_2Fc + BATCH, family='binomial')
summary(m2f.2b)
m2f.2b.or<- cbind(data.table(COEF=names(coef(m2f.2b))), as.data.table( exp(cbind(OR = coef(m2f.2b), confint(m2f.2b))) ) )
setnames(m2f.2b.or, c('2.5 %','97.5 %'), c('l95','u95'))
m2r.2b <- glm(data=subset(drs2r, COHORT!='AC_Resistance'), ASS ~ VL + COHORT + ART + NT_DIFF_2Rc + BATCH, family='binomial')
summary(m2r.2b)
m2r.2b.or<- cbind(data.table(COEF=names(coef(m2r.2b))), as.data.table( exp(cbind(OR = coef(m2r.2b), confint(m2r.2b))) ) )
setnames(m2r.2b.or, c('2.5 %','97.5 %'), c('l95','u95'))
#
# Rakai model with communities
#
m2f.4 <- glm(data=drs2f, ASS ~ VL + ART + LOC + COMM_NUM + NT_DIFF_2Fc, family='binomial')
summary(m2f.4)
m2f.4.or<- cbind(data.table(COEF=names(coef(m2f.4))), as.data.table( exp(cbind(OR = coef(m2f.4), confint(m2f.4))) ) )
setnames(m2f.4.or, c('2.5 %','97.5 %'), c('l95','u95'))
subset(m2f.4.or, u95<0.95)
m2r.4 <- glm(data=drs2r, ASS ~ VL + ART + LOC + COMM_NUM + NT_DIFF_2Rc, family='binomial')
summary(m2r.4)
m2r.4.or<- cbind(data.table(COEF=names(coef(m2r.4))), as.data.table( exp(cbind(OR = coef(m2r.4), confint(m2r.4))) ) )
setnames(m2r.4.or, c('2.5 %','97.5 %'), c('l95','u95'))
subset(m2r.4.or, u95<0.95)
save(drs2f, drs2r, m2f.1, m2r.1, m2f.1.or, m2r.1.or, m2f.2, m2r.2, m2f.2.or, m2r.2.or, m2f.3, m2r.3, m2f.3.or, m2r.3.or, m2f.4, m2r.4, m2f.4.or, m2r.4.or, file=file.path(wdir, gsub('.rda','_logistic.rda',wfile)))
}
treecomparison.explaingaps.regressions.160804<- function()
{
require(ape)
require(data.table)
wdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/explaingaps'
wfile <- 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment.rda'
load(file.path(wdir, wfile))
#
# merge MRC Uganda
#
set(dm, NULL, 'COMM_NUM', dm[, as.character(COMM_NUM)])
tmp <- dm[, which(grepl('MRC',COHORT))]
set(dm, tmp, 'COMM_NUM', dm[tmp, COHORT])
set(dm, tmp, 'COHORT', 'UG-MRC')
# find reference batch for three gene regions '1R-3F-firsthalf' '1R-3F-secondhalf' '1R-3F'
if(0)
{
tmp <- subset(dm, select=c(TAXA, SANGER_ID))
tmp[, BATCH:= regmatches(SANGER_ID,regexpr('^[0-9]+', SANGER_ID))]
tmp <- merge(dgd, tmp, by='TAXA')
dbatch <- subset(tmp, GENE%in%c('1R-3F','1R-3F-firsthalf','1R-3F-secondhalf'))[, list(ASS_AVG= mean(1-UNASS)), by=c('GENE','BATCH')]
dbatch <- dbatch[, {
z<- sort(abs(ASS_AVG-0.66),index.return=TRUE)$ix[1:10]
list(REFBATCH= BATCH[z], REF_ASS_AVG=ASS_AVG[z])
}, by='GENE']
dbatch <- dcast.data.table(dbatch, REFBATCH~GENE, value.var='REF_ASS_AVG')
# 14683 is among the 10 best for all gene regions -- take this one (from BW)
}
# calculate total mutational distance in all primers
dr <- dpand[, list(NT_DIFF_SUM=sum(NT_DIFF)), by=c('TAXA','PR')]
#dr[, length(TAXA), by='PR']
set(dr, NULL, 'PR', dr[, paste('NT_DIFF_',PR,sep='')])
# merge with gene regions
tmp <- subset(dgd, grepl('half',GENE) | (!grepl('GAG|POL|ENV',GENE) & !grepl('half',GENE)))
set(tmp, NULL, c('START','END','ACTG'), NULL)
dr <- merge(dr, tmp, by='TAXA',allow.cartesian=TRUE)
# select only meaningful comparisons between primer and gene region
dr <- subset(dr, (PR%in%c("NT_DIFF_2R","NT_DIFF_2F")&GENE=='1R-3F') |
(PR=="NT_DIFF_1R"&GENE=='2F-1R') |
(PR=="NT_DIFF_3F"&GENE=='2R-4F')) #|
#(PR=="NT_DIFF_3R"&GENE=='4F-3R-secondhalf'))
#
# prepare regression factors
#
tmp <- subset(dm, select=c(PANGEA_ID, TAXA, STUDY_ID, EXTRACT_ID, SANGER_ID, SAMPLEDATE, ARTSTART, selfReportArt, everSelfReportArt, FirstSelfReportArt, CURRENTLYONART, RECENTVL, RECENTVL_U, RECENTVL_L, RECENTVLDATE, COMET_1F1R, COMET_3F4F, COMET_4F3R, COMET_CONS, COMET_1F1R_N, COMET_3F4F_N, COMET_4F3R_N, COMET_CONS_N, COHORT, LOC, COMM_NUM))
set(tmp, NULL, 'RECENTVL', tmp[, as.numeric(RECENTVL)])
dr <- merge(dr, tmp, by='TAXA')
# batches
dr[, BATCH:=NA_character_]
tmp <- dr[, which(!is.na(SANGER_ID))]
set(dr, tmp, 'BATCH', dr[tmp, regmatches(SANGER_ID,regexpr('^[0-9]+', SANGER_ID))])
dr[, BATCH2:= as.integer(BATCH)]
# add locations to BATCH id
tmp <- dr[, list(BATCH_LABEL=paste(BATCH, ' (', paste(unique(COHORT),collapse='+'), ')',sep='')), by='BATCH']
dr <- merge(dr, tmp, by='BATCH')
dr[, BATCH:=NULL]
setnames(dr, 'BATCH_LABEL', 'BATCH')
# to every batch add one 0 and one 1
if(1)
{
tmp <- dr[, list(TAXA=c('PRIOR1','PRIOR2'), PANGEA_ID=c('PRIOR1','PRIOR2'), STUDY_ID=c('PRIOR1','PRIOR2'), SANGER_ID=c('PRIOR1','PRIOR2'), UNASS=c(0,1), COHORT=COHORT[1] ), by=c('BATCH','PR')]
#tmp <- dr[, list(TAXA=c('PRIOR1','PRIOR2','PRIOR3','PRIOR4','PRIOR5','PRIOR6'), PANGEA_ID=c('PRIOR1','PRIOR2','PRIOR3','PRIOR4','PRIOR5','PRIOR6'), STUDY_ID=c('PRIOR1','PRIOR2','PRIOR3','PRIOR4','PRIOR5','PRIOR6'), SANGER_ID=c('PRIOR1','PRIOR2','PRIOR3','PRIOR4','PRIOR5','PRIOR6'), UNASS=c(0,0,0,1,1,1), COHORT=COHORT[1] ), by=c('BATCH','PR')]
dr <- rbind(dr, tmp, use.names=TRUE, fill=TRUE)
}
# average viral load per batch
tmp <- dr[, {
ans <- NA_character_
tmp <- which((COHORT=='BW-Mochudi' & !is.na(RECENTVL)) | abs(RECENTVLDATE-SAMPLEDATE)<1)
if(length(tmp)<length(COHORT)*.5)
ans <- 'No VL measured'
tmp <- as.character(cut(median(RECENTVL[tmp]), breaks=c(0, 1e4, 2e4, 4e4, 1e5, Inf), labels=c('<1e4','1e4-2e4','2e4-4e4','4e4-1e5','>1e5')))
#tmp <- as.character(cut(median(RECENTVL[tmp]), breaks=c(0, 1e4, 5e4, 1e5, Inf), labels=c('<1e4','1e4-5e4','5e4-1e5','>1e5')))
if(!is.na(tmp) & is.na(ans))
ans <- tmp
list(BATCHVL= ans)
}, by=c('BATCH','PR')]
dr <- merge(dr, tmp, by=c('BATCH','PR'))
# ART status
dr[, ART:= as.integer(ARTSTART<SAMPLEDATE)]
set(dr, dr[, which(is.na(ART))], 'ART', 0L)
set(dr, dr[, which(ART==0 & everSelfReportArt==1 & SAMPLEDATE<FirstSelfReportArt)], 'ART', 1L)
set(dr, dr[, which(ART==0 & COHORT=='BW-Mochudi' & CURRENTLYONART=='Y')], 'ART', 1L)
set(dr, dr[, which(ART==0 & COHORT=='UG-MRC' & CURRENTLYONART=='Y')], 'ART', 1L)
set(dr, dr[, which(ART==0 & COHORT=='AC_Resistance')], 'ART', 0L)
set(dr, NULL, 'ART', dr[, factor(ART, levels=c(0L,1L), labels=c('no ART', 'ART started or self reported'))])
# use categorical values from UCL
if(1)
{
set(dr, dr[, which(is.na(RECENTVL) & RECENTVL_U<=1e4)], 'RECENTVL', 1e4-1)
set(dr, dr[, which(is.na(RECENTVL) & RECENTVL_U<=5e4)], 'RECENTVL', 5e4-1)
set(dr, dr[, which(is.na(RECENTVL) & RECENTVL_U<=1e5)], 'RECENTVL', 1e5-1)
set(dr, dr[, which(is.na(RECENTVL) & RECENTVL_U>1e5)], 'RECENTVL', 1e5+1)
stopifnot( !nrow(subset(dr, !is.na(RECENTVL_U) & is.na(RECENTVL))) )
}
# recent viral load per individual
dr[, VL:='No VL measured']
tmp <- dr[, which((COHORT=='BW-Mochudi' & !is.na(RECENTVL)) | (!is.na(RECENTVL) & abs(RECENTVLDATE-SAMPLEDATE)<1))]
#set(dr, tmp, 'VL', dr[tmp, cut(RECENTVL, breaks=c(-1, 1e4, 2e4, 4e4, 1e5, Inf), labels=c('<1e4','1e4-2e4','2e4-4e4','4e4-1e5','>1e5'))])
set(dr, tmp, 'VL', dr[tmp, cut(RECENTVL, breaks=c(-1, 1e4, 5e4, 1e5, Inf), labels=c('<1e4','1e4-5e4','5e4-1e5','>1e5'))])
# subtype
# keep as is
#set(dr, dr[, which(COMET_Region1%in%c('B','C'))], 'ST', 'B or C')
# nucleotide mutations
dr[, NT_DIFFc:= as.character(NT_DIFF_SUM)]
set(dr, dr[, which(is.na(NT_DIFFc))], 'NT_DIFFc', 'Unassembled')
set(dr, dr[, which(!NT_DIFFc%in%c('0','Unassembled'))], 'NT_DIFFc', 'at least one mutation')
# primer assembled
dr[, ASSEMBLED:= 'Yes']
set(dr, dr[, which(is.na(NT_DIFF_SUM))], 'ASSEMBLED', 'No')
# primers
set(dr, NULL, 'PR', dr[, gsub('NT_DIFF_','',PR)])
# primers + nt_diff
set(dr, NULL, 'PR_NTDIFFc', dr[, paste(PR,'-',NT_DIFFc,sep='')])
# region, community number, household number
tmp <- dr[, which(!grepl('PRIOR',TAXA) & is.na(LOC))]
set(dr, tmp, 'LOC', dr[tmp, COHORT])
set(dr, NULL, 'COMM_NUM', dr[, as.character(COMM_NUM)])
tmp <- dr[, which(!grepl('PRIOR',TAXA) & is.na(COMM_NUM))]
set(dr, tmp, 'COMM_NUM', dr[tmp, COHORT])
# extraction IDs
set(dr, NULL, 'EXTRACT_ID', dr[,as.integer(gsub('^0+','',EXTRACT_ID))])
# sample date
dr[, SAMPLEDATEc:= cut(SAMPLEDATE, breaks=seq(2009, 2015, 0.25), labels=seq(2009, 2015-0.25, 0.25))]
# amplicon
dr[, AMPLICON:='two']
set(dr, dr[, which(PR=='1R')], 'AMPLICON', 'one')
set(dr, dr[, which(PR=='3F')], 'AMPLICON', 'three')
#
# exclude AfricaCentre because pre-selected after amplification
#
dr <- subset(dr, COHORT!='AC_Resistance')
#
# re-level so that 'presumably good' factors are the reference
#
set(dr, NULL, 'AMPLICON', dr[, relevel(factor(AMPLICON), ref='one')])
set(dr, NULL, 'PR', dr[, relevel(factor(PR), ref='1R')])
set(dr, NULL, 'PR_NTDIFFc', dr[, relevel(factor(PR_NTDIFFc), ref='1R-0')])
set(dr, NULL, 'ASSEMBLED', dr[, relevel(factor(ASSEMBLED), ref='Yes')])
# defining the reference level for batches is tricky:
# if it s a very good batch, then this could simply be down to high VL
# if it s an RCCS batch with average viral load, then it could still be down to community?
# OK how about we take a batch that performs as well as a typical UG-MRC run, eg 66% on pol?
# --> this is '14683'
set(dr, NULL, 'BATCH', dr[, relevel(factor(BATCH), ref='14683 (BW-Mochudi)')])
set(dr, NULL, 'COHORT', dr[, relevel(factor(COHORT), ref='BW-Mochudi')])
#set(dr, NULL, 'COHORT', dr[, relevel(factor(COHORT), ref='AC_Resistance')])
set(dr, NULL, 'VL', dr[, relevel(factor(VL), ref='>1e5')])
set(dr, NULL, 'BATCHVL', dr[, relevel(factor(BATCHVL), ref='4e4-1e5')])
#set(dr, NULL, 'BATCHVL', dr[, relevel(factor(BATCHVL), ref='5e4-1e5')])
set(dr, NULL, 'ART', dr[, relevel(ART, ref='no ART')])
set(dr, NULL, 'COMET_1F1R', dr[, relevel(factor(COMET_1F1R), ref='A1')])
set(dr, NULL, 'COMET_3F4F', dr[, relevel(factor(COMET_3F4F), ref='A1')])
set(dr, NULL, 'COMET_4F3R', dr[, relevel(factor(COMET_4F3R), ref='A1')])
set(dr, NULL, 'COMET_CONS', dr[, relevel(factor(COMET_CONS), ref='A1')])
set(dr, NULL, 'NT_DIFFc', dr[, relevel(factor(NT_DIFFc), ref='0')])
set(dr, NULL, 'LOC', dr[, relevel(factor(LOC), ref='13')])
#set(dr, NULL, 'COMM_NUM', dr[, relevel(factor(COMM_NUM), ref='106')])
set(dr, NULL, 'COMM_NUM', dr[, relevel(factor(COMM_NUM), ref='BW-Mochudi')])
set(dr, NULL, 'SAMPLEDATEc', dr[, relevel(factor(SAMPLEDATEc), ref='2010.25')])
#
# select data
#
ggplot(dr, aes(x=UNASS)) + geom_histogram() + facet_wrap(~PR+GENE)
# use <60% vs >80%
dr[, ASS:= as.numeric(as.character(cut(UNASS, breaks=c(-1, 0.6, 0.8, 2), labels=c('1','0.5','0'))))]
#dr[, ASS:= as.numeric(as.character(cut(UNASS, breaks=c(-1, 0.4, 0.6, 2), labels=c('1','0.5','0'))))]
dr <- subset(dr, ASS!=.5)
dr[, UNASS:= 1-ASS]
#
# prepare data sets for logistic regression
#
drs <- subset(dr, select=c("ASS", "UNASS", "TAXA", "PR", "GENE", "PR_NTDIFFc", "ASSEMBLED", "AMPLICON", "EXTRACT_ID", "PANGEA_ID", "BATCH", "BATCH2", "ART", "BATCHVL", "VL", "COHORT", "LOC", "COMM_NUM", "COMET_1F1R", "COMET_3F4F", "COMET_4F3R", "COMET_CONS", "NT_DIFFc"))
drs1r <- subset(drs, !grepl('PRIOR',TAXA) & PR=='1R' & GENE=='2F-1R')
#drs1r <- subset(drs, PR=='1R' & GENE=='2F-1R')
#set(drs1r, NULL, 'PR_NTDIFFc', drs1r[,as.character(factor(as.character(PR_NTDIFFc), levels=c('1R-0','1R-Unassembled','1R-at least one mutation'),labels=c('1R no mutation','1R unassembled','1R at least one mutation')))])
set(drs1r, NULL, 'PR_NTDIFFc', drs1r[,as.character(factor(as.character(PR_NTDIFFc), levels=c('1R-0','1R-Unassembled','1R-at least one mutation'),labels=c('no mutation','unassembled','1R at least one mutation')))])
drs3f <- subset(drs, !grepl('PRIOR',TAXA) & PR=='3F' & GENE=='2R-4F')
#drs3f <- subset(drs, PR=='3F' & GENE=='2R-4F')
#set(drs3f, NULL, 'PR_NTDIFFc', drs3f[,as.character(factor(as.character(PR_NTDIFFc), levels=c('3F-0','3F-Unassembled','3F-at least one mutation'),labels=c('3F no mutation','3F unassembled','3F at least one mutation')))])
set(drs3f, NULL, 'PR_NTDIFFc', drs3f[,as.character(factor(as.character(PR_NTDIFFc), levels=c('3F-0','3F-Unassembled','3F-at least one mutation'),labels=c('no mutation','unassembled','3F at least one mutation')))])
drs2f <- subset(drs, PR=='2F' & GENE=='1R-3F-firsthalf')
drs2r <- subset(drs, PR=='2R' & GENE=='1R-3F-secondhalf')
tmp <- subset(dr, is.na(GENE) | GENE=='1R-3F')
tmp2 <- dcast.data.table(tmp, BATCH+TAXA+ASS+UNASS~PR, value.var='NT_DIFFc')
setnames(tmp2, c('2F','2R'), c('p2F','p2R'))
#tmp2[, PR_NTDIFFc:='2F or 2R unassembled']
tmp2[, PR_NTDIFFc:='unassembled']
set(tmp2, tmp2[, which(p2F=='0' & p2R=='0')], 'PR_NTDIFFc', 'no mutation')
set(tmp2, tmp2[, which(p2F=='at least one mutation' & p2R=='0')], 'PR_NTDIFFc', '2F at least one mutation')
set(tmp2, tmp2[, which(p2F=='0' & p2R=='at least one mutation')], 'PR_NTDIFFc', '2R at least one mutation, 2F no mutation')
set(tmp2, tmp2[, which(p2F=='at least one mutation' & p2R=='at least one mutation')], 'PR_NTDIFFc', '2F at least one mutation')
tmp2[, ASSEMBLED:='Yes']
set(tmp2, tmp2[, which(grepl('unassembled',PR_NTDIFFc))], 'ASSEMBLED', 'No')
set(tmp2, NULL, 'PR_NTDIFFc', tmp2[, relevel(factor(PR_NTDIFFc), ref='no mutation')])
set(tmp2, NULL, 'ASSEMBLED', tmp2[, relevel(factor(ASSEMBLED), ref='Yes')])
tmp2[, AMPLICON:='two']
tmp <- subset(tmp, PR=='2F', select=setdiff(colnames(tmp),c('PR','NT_DIFFc','PR_NTDIFFc','UNASS','ASS','NT_DIFF_SUM','ASSEMBLED','GENE','LEN','ACTG','AMPLICON')))
drs1r <- merge(tmp, subset(drs1r,select=c(TAXA,BATCH,ASS,UNASS,AMPLICON,PR_NTDIFFc,ASSEMBLED)), by=c('TAXA','BATCH'))
drs2a <- merge(tmp, subset(tmp2,select=c(TAXA,BATCH,ASS,UNASS,AMPLICON,PR_NTDIFFc,ASSEMBLED)), by=c('TAXA','BATCH'))
drs3f <- merge(tmp, subset(drs3f,select=c(TAXA,BATCH,ASS,UNASS,AMPLICON,PR_NTDIFFc,ASSEMBLED)), by=c('TAXA','BATCH'))
drs12a <- rbind(drs2a, drs1r, use.names=TRUE)
drs23a <- rbind(drs2a, drs3f, use.names=TRUE)
drs2 <- subset(drs2a, select=c("ASS", "UNASS", "TAXA", "EXTRACT_ID", "PANGEA_ID", "BATCH", "BATCH2", "PR_NTDIFFc", "AMPLICON", "ASSEMBLED", "ART", "BATCHVL", "VL", "COHORT", "LOC", "COMM_NUM", "COMET_1F1R", "COMET_3F4F", "COMET_4F3R", "COMET_CONS"))
drs2t <- subset(drs2a, select=c("ASS", "UNASS", "TAXA", "EXTRACT_ID", "PANGEA_ID", "BATCH", "BATCH2", "PR_NTDIFFc", "AMPLICON","ASSEMBLED", "ART", "BATCHVL", "VL", "COHORT", "LOC", "COMM_NUM", "COMET_1F1R", "COMET_3F4F", "COMET_4F3R", "COMET_CONS", "SAMPLEDATEc", 'SAMPLEDATE'))
set(drs2t, NULL, 'SAMPLEDATEc', drs2t[, relevel(factor(SAMPLEDATEc), ref='2011.5')])
drs2npr <- subset(drs2, !is.na(LOC))
drs12 <- subset(drs12a, select=c("ASS", "UNASS", "TAXA", "EXTRACT_ID", "PANGEA_ID", "BATCH", "BATCH2", "PR_NTDIFFc", "AMPLICON", "ASSEMBLED", "ART", "BATCHVL", "VL", "COHORT", "LOC", "COMM_NUM", "COMET_1F1R", "COMET_3F4F", "COMET_4F3R", "COMET_CONS"))
drs12npr<- subset(drs12, !is.na(LOC))
drs12t <- subset(drs12a, select=c("ASS", "UNASS", "TAXA", "EXTRACT_ID", "PANGEA_ID", "BATCH", "BATCH2", "PR_NTDIFFc", "AMPLICON","ASSEMBLED", "ART", "BATCHVL", "VL", "COHORT", "LOC", "COMM_NUM", "COMET_1F1R", "COMET_3F4F", "COMET_4F3R", "COMET_CONS", "SAMPLEDATEc", 'SAMPLEDATE'))
set(drs12t, NULL, 'SAMPLEDATEc', drs12t[, relevel(factor(SAMPLEDATEc), ref='2011.5')])
drs23 <- subset(drs23a, select=c("ASS", "UNASS", "TAXA", "EXTRACT_ID", "PANGEA_ID", "BATCH", "BATCH2", "PR_NTDIFFc", "AMPLICON", "ASSEMBLED", "ART", "BATCHVL", "VL", "COHORT", "LOC", "COMM_NUM", "COMET_1F1R", "COMET_3F4F", "COMET_4F3R", "COMET_CONS"))
drs23npr<- subset(drs23, !is.na(LOC))
set(drs12, NULL, 'AMPLICON', drs12[, relevel(factor(AMPLICON), ref='one')])
set(drs12npr, NULL, 'AMPLICON', drs12npr[, relevel(factor(AMPLICON), ref='one')])
#
# look at the full region 1R-3F <60% vs >80%
#
m2.1 <- glm(data=drs2, UNASS ~ VL + COHORT + PR_NTDIFFc + ART + BATCHVL + BATCH , family='binomial')
m12.1 <- glm(data=drs12, UNASS ~ VL + COHORT + AMPLICON + PR_NTDIFFc + ART + BATCHVL + BATCH , family='binomial')
#m12.1 <- glm(data=drs12, UNASS ~ VL + COHORT + PR_NTDIFFc + AMPLICON + ART + BATCHVL + BATCH , family='binomial')
# sub-analyses to calculate AIC
m2.1b <- glm(data=drs2npr, UNASS ~ VL + COHORT + PR_NTDIFFc + ART + BATCHVL + BATCH, family='binomial')
m2.1c <- glm(data=subset(drs2npr, COMET_1F1R!='check' & VL!='No VL measured'), UNASS ~ VL + COHORT + PR_NTDIFFc + ART + BATCH, family='binomial')
# odds ratios
m2.1.or <- cbind(data.table(COEF=names(coef(m2.1))), as.data.table( exp(cbind(OR = coef(m2.1), confint(m2.1))) ) )
setnames(m2.1.or, c('2.5 %','97.5 %'), c('l95','u95'))
m12.1.or <- cbind(data.table(COEF=names(coef(m12.1))), as.data.table( exp(cbind(OR = coef(m12.1), confint(m12.1))) ) )
setnames(m12.1.or, c('2.5 %','97.5 %'), c('l95','u95'))
#subset(m2.1.or, l95>1)
#subset(m12.1.or, l95>1)
# exclude sig plates
sig.plates <- subset(m12.1.or, l95>1 & grepl('BATCH',COEF))[, as.integer(regmatches(COEF, regexpr('[0-9]+',COEF)))]
sig.plates <- c(sig.plates, 16033, 15934)
m2.1e <- glm(data=subset(drs2npr,!BATCH2%in%sig.plates), UNASS ~ VL + COHORT + PR_NTDIFFc + ART, family='binomial')
summary(m2.1e)
m2.1e.or <- cbind(data.table(COEF=names(coef(m2.1e))), as.data.table( exp(cbind(OR = coef(m2.1e), confint(m2.1e))) ) )
setnames(m2.1e.or, c('2.5 %','97.5 %'), c('l95','u95'))
# exclude also those that did not assemble
m2.1f <- glm(data=subset(drs2npr,!BATCH2%in%sig.plates & !grepl('unassembled',PR_NTDIFFc)), UNASS ~ VL + COHORT + PR_NTDIFFc + ART, family='binomial')
summary(m2.1f)
m2.1f.or <- cbind(data.table(COEF=names(coef(m2.1f))), as.data.table( exp(cbind(OR = coef(m2.1f), confint(m2.1f))) ) )
setnames(m2.1f.or, c('2.5 %','97.5 %'), c('l95','u95'))
#subset(m2.1e.or, l95>1)
# where from?
subset(dm, as.integer(gsub('_.*','',SANGER_ID))%in%sig.plates )
# I like this model in the end!
# - 2F comes out sig and not 2R as in the individual analysis
# - UG-MRC worse than RCCS, batches offer better explanation than study for RCCS
# - there is a nice series of batches that fail, from 15886 to 15964
# repeat with amplicon 1 included
m12.1e <- glm(data=subset(drs12npr,!BATCH2%in%sig.plates), UNASS ~ VL + COHORT + AMPLICON + PR_NTDIFFc + ART, family='binomial')
summary(m12.1e)
m12.1e.or <- cbind(data.table(COEF=names(coef(m12.1e))), as.data.table( exp(cbind(OR = coef(m12.1e), confint(m12.1e))) ) )
setnames(m12.1e.or, c('2.5 %','97.5 %'), c('l95','u95'))
#subset(m12.1e.or, l95>1)
#m12.1g <- glm(data=subset(drs12npr,!BATCH2%in%tmp), UNASS ~ VL + COHORT + AMPLICON:ASSEMBLED + ART, family='binomial')
#summary(m12.1g)
#m12.1g.or <- cbind(data.table(COEF=names(coef(m12.1g))), as.data.table( exp(cbind(OR = coef(m12.1g), confint(m12.1g))) ) )
#setnames(m12.1g.or, c('2.5 %','97.5 %'), c('l95','u95'))
#
# repeat with amplicon 3 included
m23.1e <- glm(data=subset(drs23npr,!BATCH2%in%sig.plates & !grepl('unassembled',PR_NTDIFFc)), UNASS ~ VL + COHORT + PR_NTDIFFc + ART, family='binomial')
summary(m23.1e)
m23.1e.or <- cbind(data.table(COEF=names(coef(m23.1e))), as.data.table( exp(cbind(OR = coef(m23.1e), confint(m23.1e))) ) )
setnames(m23.1e.or, c('2.5 %','97.5 %'), c('l95','u95'))
# - see if batch VL can be dropped..
m2.2 <- glm(data=drs2, UNASS ~ VL + COHORT + PR_NTDIFFc + ART + BATCH, family='binomial')
m2.2b <- glm(data=drs2npr, UNASS ~ VL + COHORT + PR_NTDIFFc + ART + BATCH, family='binomial')
summary(m2.2)
# Hmmm
# - the batch effect from 15892 to 15964 are not present any more
m2.3 <- glm(data=drs2npr, UNASS ~ VL + COHORT + PR_NTDIFFc + ART, family='binomial')
summary(m2.3)
# - without batches, RCCS worse than UG-MRC: batches offer better explanation than study for RCCS
# - 2R one mutation also not sig
m2.4 <- glm(data=drs2npr, UNASS ~ VL + PR_NTDIFFc + ART + COMM_NUM, family='binomial')
summary(m2.4)
#
# combined model communities + batches.
# just a few communities with borderline significant coefficients
# AIC worse than for batch models, but not terribly worse
# the problem is that the prior is deleted because communities are missing, and so there are singularities
m2.6 <- glm(data=drs2, UNASS ~ VL + COMM_NUM + PR_NTDIFFc + ART + BATCHVL + BATCH, family='binomial')
summary(m2.6)
#
# plot significant batches
#
tmp <- subset(drs2t, !grepl('PRIOR',TAXA))
tmp2 <- subset(m12.1.or, l95>1 & OR<=3 & grepl('BATCH',COEF))[, as.integer(regmatches(COEF, regexpr('[0-9]+',COEF)))]
tmp2 <- tmp[, which(BATCH2%in%tmp2)]
set(tmp, tmp2, 'BATCH', tmp[tmp2,paste(BATCH,' *',sep='')])
tmp2 <- subset(m12.1.or, l95>1 & OR>3 & OR<=10 & grepl('BATCH',COEF))[, as.integer(regmatches(COEF, regexpr('[0-9]+',COEF)))]
tmp2 <- tmp[, which(BATCH2%in%tmp2)]
set(tmp, tmp2, 'BATCH', tmp[tmp2,paste(BATCH,' **',sep='')])
tmp2 <- subset(m12.1.or, l95>1 & OR>10 & grepl('BATCH',COEF))[, as.integer(regmatches(COEF, regexpr('[0-9]+',COEF)))]
tmp2 <- tmp[, which(BATCH2%in%tmp2)]
set(tmp, tmp2, 'BATCH', tmp[tmp2,paste(BATCH,' ***',sep='')])
tmp2 <- tmp[, which(COHORT=='RCCS')]
set(tmp, tmp2, 'COMM_NUM', tmp[tmp2,paste('Rakai-',COMM_NUM,sep='')])
tmp2 <- tmp[, list(EXTRACT_MED=median(EXTRACT_ID)), by=c('BATCH')]
setkey(tmp2, EXTRACT_MED)
set(tmp2, NULL, 'BATCH3', tmp2[, factor(EXTRACT_MED, levels=EXTRACT_MED, labels=BATCH)])
tmp <- tmp[, list(N=length(TAXA), P=mean(UNASS==1), SD= ifelse(all(is.na(SAMPLEDATE)), -1L, as.integer(mean(SAMPLEDATE, na.rm=1)<2012.25))), by=c('BATCH','BATCH2','COMM_NUM')]
tmp <- merge(tmp, tmp2, by='BATCH')
tmp <- subset(tmp, !COMM_NUM%in%c('Rakai-RCCS','MRC'))
ggplot( tmp, aes(x=BATCH3, y=COMM_NUM, size=N, colour=P, pch=factor(SD, levels=c(-1,0,1),labels=c('Unknown','No','Yes')))) +
geom_point() +
scale_colour_gradient(low='blue', high='orange') +
theme_bw() + theme(axis.text.x=element_text(angle=90, vjust=1)) +
labs(x='sequencing plate (ordered by sample extraction at UCLH)', y='sampling location', colour='proportion of\nsamples with\n<20% assembled sites\nin 1R-3F', size='number of\nsamples', pch='Average\nsample date\nbefore 2012.25')
ggsave(file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_batch_vs_community.pdf'), w=25, h=10, useDingbats=FALSE)
#
# sub analysis: is there an interaction with extraction ID?
#
if(0)
{
setkey(drs2npr, EXTRACT_ID)
drs2npr[, UNASS_ROLL_EXTRACT:= drs2npr[,rollapply(UNASS, width=20, FUN=mean, align="center", partial=TRUE)]]
tmp <- subset(m12.1.or, l95>1 & grepl('BATCH',COEF))[, as.integer(regmatches(COEF, regexpr('[0-9]+',COEF)))]
ggplot(drs2npr, aes(x=EXTRACT_ID, y=BATCH, fill=UNASS_ROLL_EXTRACT, pch=COHORT, colour=factor(BATCH2%in%tmp, levels=c(FALSE,TRUE),labels=c('N','Y')))) +
geom_point(pch=21, size=2, stroke=0.1) +
scale_x_continuous(breaks=seq(0,1e4,200)) +
scale_colour_manual(values=c('Y'='black','N'='transparent')) +
scale_fill_gradient(low='blue', high='orange') +
labs(x='\nUCLH extraction ID', y='Sanger plate\n', colour='significant plate effect', fill='rolling mean\nover 20 consecutive samples of\nsequence with >80% not assembled in 1R3F') +
theme_bw()
ggsave(file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_batch_vs_extraction.pdf'), w=20, h=20)
ggplot(drs2npr, aes(x=EXTRACT_ID, y=UNASS_ROLL_EXTRACT, colour=COHORT)) +
geom_line() +
facet_grid(COHORT~.)
}
#
# sub analysis: is there an interaction between missingVL and cohort ID?
#
dvl <- copy(drs2npr)
tmp <- dvl[, which(VL=='No VL measured')]
set(dvl, tmp, 'VL', dvl[tmp, paste(VL,COHORT,sep='_')])
set(dvl, NULL, 'VL', dvl[, relevel(factor(as.character(VL)), ref='>1e5')])
m2.9 <- glm(data=subset(dvl,!BATCH2%in%sig.plates), UNASS ~ VL + COHORT + PR_NTDIFFc + ART, family='binomial')
summary(m2.9)
m2.9.or <- cbind(data.table(COEF=names(coef(m2.9))), as.data.table( exp(cbind(OR = coef(m2.9), confint(m2.9))) ) )
#
# sub analysis: does subtype explain plates and study effects?
# exclude no VL
m2.8 <- glm(data=subset(drs2npr, COMET_1F1R!='check' & VL!='No VL measured'), UNASS ~ VL + COMET_1F1R:COHORT + PR_NTDIFFc + ART, family='binomial')
# don t use above: there is a strong study effect, and this should not be attributed to
m2.8 <- glm(data=subset(drs2npr, COMET_1F1R!='check' & VL!='No VL measured'), UNASS ~ VL + COHORT + COMET_1F1R + PR_NTDIFFc + ART, family='binomial')
# with no VL excluded, there is only two cohorts and the model can adjust for study effect and subtype,
# because all BW sequs are C
# short not sig because the mutations are included
summary(m2.8)
sapply(list(m2.1c, m2.8), AIC) #AIC comparable but this excludes the odd batches.. ..so what s the point?
sapply(list(m2.1c, m2.8), BIC) #BIC prefers m2.8!
# sub on RCCS
# TODO: include MRC once we have viral loads
drs12st <- subset(drs12npr, COHORT%in%c('RCCS','UG-MRC','BW-Mochudi') & COMET_1F1R!='short' & !BATCH2%in%sig.plates)
set(drs12st, NULL, 'AMPLICON', drs12st[, relevel(factor(AMPLICON), ref='one')])
m2.5.1F1R <- glm(data=drs12st, UNASS ~ VL + COHORT + ART + AMPLICON + PR_NTDIFFc + COMET_1F1R, family='binomial')
m2.5.3F4F <- glm(data=subset(drs12npr, COHORT%in%c('RCCS','UG-MRC','BW-Mochudi') & COMET_3F4F!='short' & !BATCH2%in%sig.plates), UNASS ~ VL + COHORT + ART + AMPLICON + PR_NTDIFFc + COMET_3F4F, family='binomial')
m2.5.4F3R <- glm(data=subset(drs12npr, COHORT%in%c('RCCS','UG-MRC','BW-Mochudi') & COMET_4F3R!='short' & !BATCH2%in%sig.plates), UNASS ~ VL + COHORT + ART + AMPLICON + PR_NTDIFFc + COMET_4F3R, family='binomial')
m2.5.CONS <- glm(data=subset(drs12npr, COHORT%in%c('RCCS','UG-MRC','BW-Mochudi') & COMET_CONS!='short' & !BATCH2%in%sig.plates), UNASS ~ VL + COHORT + ART + AMPLICON + PR_NTDIFFc + COMET_CONS, family='binomial')
summary(m2.5.1F1R)
summary(m2.5.3F4F)
summary(m2.5.4F3R)
summary(m2.5.CONS)
tmp <- m2.5.1F1R
m2.5.1F1R.or<- cbind(data.table(COEF=names(coef(tmp))), as.data.table( exp(cbind(OR = coef(tmp), confint(tmp))) ) )
setnames(m2.5.1F1R.or, c('2.5 %','97.5 %'), c('l95','u95'))
tmp <- m2.5.3F4F
m2.5.3F4F.or<- cbind(data.table(COEF=names(coef(tmp))), as.data.table( exp(cbind(OR = coef(tmp), confint(tmp))) ) )
setnames(m2.5.3F4F.or, c('2.5 %','97.5 %'), c('l95','u95'))
tmp <- m2.5.4F3R
m2.5.4F3R.or<- cbind(data.table(COEF=names(coef(tmp))), as.data.table( exp(cbind(OR = coef(tmp), confint(tmp))) ) )
setnames(m2.5.4F3R.or, c('2.5 %','97.5 %'), c('l95','u95'))
tmp <- m2.5.CONS
m2.5.CONS.or<- cbind(data.table(COEF=names(coef(tmp))), as.data.table( exp(cbind(OR = coef(tmp), confint(tmp))) ) )
setnames(m2.5.CONS.or, c('2.5 %','97.5 %'), c('l95','u95'))
drs2stvl <- copy(drs2st)
tmp <- drs2stvl[, which(VL=='No VL measured')]
set(drs2stvl, tmp, 'VL', drs2stvl[tmp, paste(VL,COHORT,sep='_')])
set(drs2stvl, NULL, 'VL', drs2stvl[, relevel(factor(as.character(VL)), ref='>1e5')])
m2.10 <- glm(data=drs2stvl, UNASS ~ VL + COHORT + COMET_CONS + PR_NTDIFFc + ART, family='binomial')
summary(m2.10)
tmp <- m2.10
m2.10.CONS.or<- cbind(data.table(COEF=names(coef(tmp))), as.data.table( exp(cbind(OR = coef(tmp), confint(tmp))) ) )
setnames(m2.10.CONS.or, c('2.5 %','97.5 %'), c('l95','u95'))
#
# sub analysis Rakai: sample date
# sample date and batch confounded, should not include batch
# sort of overfitting. take out comm num?
# cool -- this now recovers the sample date effect!!
m2.7 <- glm(data=subset(drs2t, COHORT=='RCCS'), UNASS ~ VL + SAMPLEDATEc + PR_NTDIFFc + ART + ST, family='binomial')
summary(m2.7)
#
# compare AIC of batch model vs community model (must use same data)
# the batch models have lower AIC than the comm_num model
if(0)
{
sapply(list(m2.1b, m2.2b, m2.3, m2.4), AIC)
sapply(list(m2.1b, m2.2b, m2.3, m2.4), AIC)
}
#
# explore if I can use different link functions (log link to get RR)
# ... not successful
if(0)
{
p2.5 <- glm(data=subset(drs2npr, ST%in%c('A1','D')), UNASS ~ VL + COHORT + PR_NTDIFFc + ART + ST, family=poisson(link=log))
r2.5 <- glm(data=subset(drs2npr, ST%in%c('A1','D')), UNASS ~ VL + COHORT + PR_NTDIFFc + ART + ST, family=binomial(link=log), control=glm.control(epsilon=1e-8, maxit=100, trace=TRUE), start=c(-2.12,0.93, 0.96, 0.87, 0.72823123, 0.60499165, -0.02665851,0.46523009, 1.07472518, 0.35393831, 0.94179246, 0.03808863,0))
#qb2.5 <- glm(data=subset(drs2npr, ST%in%c('A1','D')), UNASS ~ VL + COHORT + PR_NTDIFFc + ART + ST, family=quasibinomial(link=log), start=c(-2.12,0.93, 0.96, 0.87, 0.72823123, 0.60499165, -0.02665851,0.46523009, 1.07472518, 0.35393831, 0.94179246, 0.03808863,0))
#p2.5 <- glm(data=subset(drs2npr, ST%in%c('A1','D')), UNASS ~ VL + COHORT + PR_NTDIFFc + ART + ST, family=poisson(link=log))
library(ResourceSelection)
hoslem.test(subset(drs2npr, ST%in%c('A1','D'))[, UNASS], fitted(m2.5))
hoslem.test(subset(drs2npr, ST%in%c('A1','D'))[, UNASS], fitted(p2.5))
hoslem.test(subset(drs2npr, ST%in%c('A1','D'))[, UNASS], fitted(r2.5))
sapply(list(m2.5, p2.5, r2.5), AIC) #the logit model also has smallest deviance and is easiest to fit
require(AICcmodavg)
sapply(list(m2.5, p2.5, r2.5), AICc) #this is basically the same
p2.2 <- glm(data=drs2, UNASS ~ VL + COHORT + PR_NTDIFFc + ART + BATCH, family=poisson(link=log))
# I don t think r2.2 has converged..
r2.2 <- glm(data=drs2, UNASS ~ VL + COHORT + PR_NTDIFFc + ART + BATCH, family=binomial(link=log), control=glm.control(epsilon=1e-8, maxit=1e3, trace=FALSE),
start=c(-2.41272163456974, 0.557456598211733, 0.486523777039746, 0.385699510360356, 0.319142195398426, 0.875550188558768, 1.03022591978234, 0.513438150979493, rep(0, 114)))
}
if(0)
{
subset(drs2a, !grepl('PRIOR',TAXA))[, list(N_ASS= sum(ASS), N_UNASS=sum(1-ASS), P_UNASS=round(sum(1-ASS)/length(ASS), d=2)), by='BATCH']
subset(drs2a, !grepl('PRIOR',TAXA))[, list(N_ASS= sum(ASS), N_UNASS=sum(1-ASS), P_UNASS=round(sum(1-ASS)/length(ASS), d=2)), by='COMM_NUM']
subset(drs2a, !grepl('PRIOR',TAXA))[, list(N_ASS= sum(ASS), N_UNASS=sum(1-ASS), P_UNASS=round(sum(1-ASS)/length(ASS), d=2)), by='COMET_Region1']
subset(drs2a, !grepl('PRIOR',TAXA))[, table(COMM_NUM, floor(SAMPLEDATE))]
}
#
# write odds ratio tables
#
tmp <- copy(m12.1e.or)
set(tmp, NULL, 'OR', tmp[, round(OR,d=2)])
set(tmp, NULL, 'CI', tmp[, paste(round(l95,d=2),'-',round(u95,d=2),sep='')])
write.csv(subset(tmp, !is.na(OR), select=c(COEF, OR, CI)), row.names=FALSE, file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_regression_model12.1e.csv'))
tmp <- copy(m2.1e.or)
set(tmp, NULL, 'OR', tmp[, round(OR,d=2)])
set(tmp, NULL, 'CI', tmp[, paste(round(l95,d=2),'-',round(u95,d=2),sep='')])
write.csv(subset(tmp, !is.na(OR), select=c(COEF, OR, CI)), row.names=FALSE, file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_regression_model2.1.csv'))
tmp <- copy(m2.5.1F1R.or)
set(tmp, NULL, 'OR', tmp[, round(OR,d=2)])
set(tmp, NULL, 'CI', tmp[, paste(round(l95,d=2),'-',round(u95,d=2),sep='')])
write.csv(subset(tmp, !is.na(OR), select=c(COEF, OR, CI)), row.names=FALSE, file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_regression_model2.5.1F1R.or.csv'))
tmp <- copy(m2.5.3F4F.or)
set(tmp, NULL, 'OR', tmp[, round(OR,d=2)])
set(tmp, NULL, 'CI', tmp[, paste(round(l95,d=2),'-',round(u95,d=2),sep='')])
write.csv(subset(tmp, !is.na(OR), select=c(COEF, OR, CI)), row.names=FALSE, file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_regression_model2.5.3F4F.csv'))
tmp <- copy(m2.5.4F3R.or)
set(tmp, NULL, 'OR', tmp[, round(OR,d=2)])
set(tmp, NULL, 'CI', tmp[, paste(round(l95,d=2),'-',round(u95,d=2),sep='')])
write.csv(subset(tmp, !is.na(OR), select=c(COEF, OR, CI)), row.names=FALSE, file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_regression_model2.5.4F3R.csv'))
tmp <- copy(m2.5.CONS.or)
set(tmp, NULL, 'OR', tmp[, round(OR,d=2)])
set(tmp, NULL, 'CI', tmp[, paste(round(l95,d=2),'-',round(u95,d=2),sep='')])
write.csv(subset(tmp, !is.na(OR), select=c(COEF, OR, CI)), row.names=FALSE, file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_regression_model2.5.CONS.csv'))
#
# run batches - subanalysis
#
drs2b <- copy(drs2a)
tmp2 <- unique(subset(drs2b, BATCH2>=15892 & BATCH2<=15964 & !is.na(COMM_NUM), COMM_NUM), by='COMM_NUM')
tmp2 <- unique(subset(merge(drs2b, tmp2, by='COMM_NUM'), select=BATCH2), by='BATCH2')
drs2b <- merge(drs2b, tmp2, by='BATCH2')
drs2b[, UNASSB:= 0L]
set(drs2b, drs2b[, which(BATCH2>=15892 & BATCH2<=15964 & !is.na(COMM_NUM))], 'UNASSB', 1L)
drs2b <- subset(drs2b, !grepl('PRIOR',TAXA), select=c("UNASSB", "TAXA", "PANGEA_ID", "BATCH", "PR_NTDIFFc", "ART", "BATCHVL", "VL", "COHORT", "LOC", "COMM_NUM", "ST", "SAMPLEDATE", "SAMPLEDATEc"))
set(drs2b, NULL, 'SAMPLEDATEc', drs2b[, relevel(factor(SAMPLEDATEc), ref='2011.75')])
write.csv(drs2b, row.names=FALSE, file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_batch_vs_community.csv'))
s2.1 <- glm(data=subset(drs2b, !is.na(SAMPLEDATE)), UNASSB ~ VL + COMM_NUM + PR_NTDIFFc + ART + ST + SAMPLEDATE, family='binomial')
summary(s2.1)
s2.2 <- glm(data=subset(drs2b, !is.na(SAMPLEDATE)), UNASSB ~ VL + PR_NTDIFFc + ART + ST + SAMPLEDATEc, family='binomial')
summary(s2.2)
#
# do the predictions, use model m2.1
#
# exclude singular batches to get OK prediction
#
# data for predictions
tmp <- subset(m12.1.or, l95>1.05 & grepl('BATCH',COEF))[, as.integer(regmatches(COEF, regexpr('[0-9]+',COEF)))]
dpr <- subset(drs2npr, COHORT%in%c('RCCS','UG-MRC','BW-Mochudi') & COMET_CONS!='short' & !BATCH2%in%tmp)
# higher viral load
tmp <- subset(dpr, VL!='No VL measured')
pr0 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
set(tmp, tmp[, which(VL%in%c('<1e4'))], 'VL', '1e4-2e4')
pr1 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
set(tmp, tmp[, which(VL%in%c('<1e4', '1e4-2e4'))], 'VL', '2e4-4e4')
pr2 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
set(tmp, tmp[, which(VL%in%c('<1e4', '1e4-2e4', '2e4-4e4'))], 'VL', '4e4-1e5')
pr3 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
set(tmp, tmp[, which(VL%in%c('<1e4', '1e4-2e4', '2e4-4e4','4e4-1e5'))], 'VL', '>1e5')
pr4 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
tmp[, H0u_p:= UNASS]
tmp[, H0all_p:= 1]
tmp[, H0_p:= pr0$fit]
tmp[, H0_l:= pr0$fit-1.96*pr0$se.fit]
tmp[, H0_u:= pr0$fit+1.96*pr0$se.fit]
tmp[, H1_p:= pr1$fit]
tmp[, H1_l:= pr1$fit-1.96*pr1$se.fit]
tmp[, H1_u:= pr1$fit+1.96*pr1$se.fit]
tmp[, H2_p:= pr2$fit]
tmp[, H2_l:= pr2$fit-1.96*pr2$se.fit]
tmp[, H2_u:= pr2$fit+1.96*pr2$se.fit]
tmp[, H3_p:= pr3$fit]
tmp[, H3_l:= pr3$fit-1.96*pr3$se.fit]
tmp[, H3_u:= pr3$fit+1.96*pr3$se.fit]
tmp[, H4_p:= pr4$fit]
tmp[, H4_l:= pr4$fit-1.96*pr4$se.fit]
tmp[, H4_u:= pr4$fit+1.96*pr4$se.fit]
tmp <- melt(tmp, id.vars=c('COHORT'), measure.vars=colnames(tmp)[grepl('^H[0-9]+',colnames(tmp))])
set(tmp, tmp[, which(value<0)], 'value', 0)
set(tmp, tmp[, which(value>1)], 'value', 1)
set(tmp, NULL, 'TYPE', tmp[, gsub('_','',regmatches(variable, regexpr('_[a-z]', variable)))])
set(tmp, NULL, 'variable', tmp[, regmatches(variable, regexpr('[A-Za-z0-9]+', variable))])
tmp <- tmp[, list(value=sum(value)), by=c('COHORT','TYPE','variable')]
tmp2 <- subset(tmp, variable=='H0' & TYPE=='p')
setnames(tmp2, 'value', 'BL')
tmp <- merge(tmp, subset(tmp2, select=c(COHORT, BL)), by='COHORT')
tmp2 <- subset(tmp, variable=='H0all')
setnames(tmp2, 'value', 'TOTAL')
tmp <- merge(tmp, subset(tmp2, select=c(COHORT, TOTAL)), by='COHORT')
tmp <- subset(tmp, variable!='H0u' & variable!='H0all' & TYPE=='p')
tmp[, REDUCTION_N:= BL-value]
tmp[, REDUCTION_P:= (BL-value)/TOTAL]
tmp <- subset(merge(tmp, data.table(variable=c('H1','H2','H3','H4'), LABEL=c('VL >1e4','VL >2e4','VL >4e4','VL >1e5')), by='variable'), select=c(COHORT, LABEL, REDUCTION_N, REDUCTION_P, TOTAL))
ans <- copy(tmp)
#
# primer sites assembled
tmp <- copy(dpr)
pr0 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
set(tmp, tmp[, which(PR_NTDIFFc%in%c('2F or 2R unassembled'))], 'PR_NTDIFFc', '0')
pr1 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
tmp[, H0u_p:= UNASS]
tmp[, H0all_p:= 1]
tmp[, H0_p:= pr0$fit]
tmp[, H0_l:= pr0$fit-1.96*pr0$se.fit]
tmp[, H0_u:= pr0$fit+1.96*pr0$se.fit]
tmp[, H1_p:= pr1$fit]
tmp[, H1_l:= pr1$fit-1.96*pr1$se.fit]
tmp[, H1_u:= pr1$fit+1.96*pr1$se.fit]
tmp <- melt(tmp, id.vars=c('COHORT'), measure.vars=colnames(tmp)[grepl('^H[0-9]+',colnames(tmp))])
set(tmp, tmp[, which(value<0)], 'value', 0)
set(tmp, tmp[, which(value>1)], 'value', 1)
set(tmp, NULL, 'TYPE', tmp[, gsub('_','',regmatches(variable, regexpr('_[a-z]', variable)))])
set(tmp, NULL, 'variable', tmp[, regmatches(variable, regexpr('[A-Za-z0-9]+', variable))])
tmp <- tmp[, list(value=sum(value)), by=c('COHORT','TYPE','variable')]
tmp2 <- subset(tmp, variable=='H0' & TYPE=='p')
setnames(tmp2, 'value', 'BL')
tmp <- merge(tmp, subset(tmp2, select=c(COHORT, BL)), by='COHORT')
tmp2 <- subset(tmp, variable=='H0all')
setnames(tmp2, 'value', 'TOTAL')
tmp <- merge(tmp, subset(tmp2, select=c(COHORT, TOTAL)), by='COHORT')
tmp <- subset(tmp, variable!='H0u' & variable!='H0all' & TYPE=='p')
tmp[, REDUCTION_N:= BL-value]
tmp[, REDUCTION_P:= (BL-value)/TOTAL]
tmp <- subset(merge(tmp, data.table(variable=c('H1'), LABEL=c('Assembled primer sites')), by='variable'), select=c(COHORT, LABEL, REDUCTION_N, REDUCTION_P, TOTAL))
ans <- rbind(ans, tmp)
#
# no mutations
tmp <- subset(dpr, PR_NTDIFFc!='2F or 2R unassembled')
pr0 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
set(tmp, tmp[, which(PR_NTDIFFc%in%c('2F at least one mutation'))], 'PR_NTDIFFc', '0')
pr1 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
set(tmp, tmp[, which(PR_NTDIFFc%in%c('2F at least one mutation','2R at least one mutation, 2F no mutation'))], 'PR_NTDIFFc', '0')
pr2 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
tmp[, H0u_p:= UNASS]
tmp[, H0all_p:= 1]
tmp[, H0_p:= pr0$fit]
tmp[, H0_l:= pr0$fit-1.96*pr0$se.fit]
tmp[, H0_u:= pr0$fit+1.96*pr0$se.fit]
tmp[, H1_p:= pr1$fit]
tmp[, H1_l:= pr1$fit-1.96*pr1$se.fit]
tmp[, H1_u:= pr1$fit+1.96*pr1$se.fit]
tmp[, H2_p:= pr2$fit]
tmp[, H2_l:= pr2$fit-1.96*pr2$se.fit]
tmp[, H2_u:= pr2$fit+1.96*pr2$se.fit]
tmp <- melt(tmp, id.vars=c('COHORT'), measure.vars=colnames(tmp)[grepl('^H[0-9]+',colnames(tmp))])
set(tmp, tmp[, which(value<0)], 'value', 0)
set(tmp, tmp[, which(value>1)], 'value', 1)
set(tmp, NULL, 'TYPE', tmp[, gsub('_','',regmatches(variable, regexpr('_[a-z]', variable)))])
set(tmp, NULL, 'variable', tmp[, regmatches(variable, regexpr('[A-Za-z0-9]+', variable))])
tmp <- tmp[, list(value=sum(value)), by=c('COHORT','TYPE','variable')]
tmp2 <- subset(tmp, variable=='H0' & TYPE=='p')
setnames(tmp2, 'value', 'BL')
tmp <- merge(tmp, subset(tmp2, select=c(COHORT, BL)), by='COHORT')
tmp2 <- subset(tmp, variable=='H0all')
setnames(tmp2, 'value', 'TOTAL')
tmp <- merge(tmp, subset(tmp2, select=c(COHORT, TOTAL)), by='COHORT')
tmp <- subset(tmp, variable!='H0u' & variable!='H0all' & TYPE=='p')
tmp[, REDUCTION_N:= BL-value]
tmp[, REDUCTION_P:= (BL-value)/TOTAL]
tmp <- subset(merge(tmp, data.table(variable=c('H1','H2'), LABEL=c('PR 2F no mutation','PR 2F, 2R no mutation')), by='variable'), select=c(COHORT, LABEL, REDUCTION_N, REDUCTION_P, TOTAL))
ans <- rbind(ans, tmp)
#
# subtype (from sub analysis)
tmp <- copy(dpr)
pr0 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
set(tmp, tmp[, which(COMET_CONS%in%c('D'))], 'COMET_CONS', 'A1')
pr1 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
set(tmp, tmp[, which(COMET_CONS%in%c('D','C'))], 'COMET_CONS', 'A1')
pr2 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
set(tmp, tmp[, which(COMET_CONS%in%c('D','C','pot_recombinant'))], 'COMET_CONS', 'A1')
pr3 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
set(tmp, tmp[, which(COMET_CONS%in%c('D','C','other','pot_recombinant'))], 'COMET_CONS', 'A1')
pr4 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
tmp[, H0u_p:= UNASS]
tmp[, H0all_p:= 1]
tmp[, H0_p:= pr0$fit]
tmp[, H0_l:= pr0$fit-1.96*pr0$se.fit]
tmp[, H0_u:= pr0$fit+1.96*pr0$se.fit]
tmp[, H1_p:= pr1$fit]
tmp[, H1_l:= pr1$fit-1.96*pr1$se.fit]
tmp[, H1_u:= pr1$fit+1.96*pr1$se.fit]
tmp[, H2_p:= pr2$fit]
tmp[, H2_l:= pr2$fit-1.96*pr2$se.fit]
tmp[, H2_u:= pr2$fit+1.96*pr2$se.fit]
tmp[, H3_p:= pr3$fit]
tmp[, H3_l:= pr3$fit-1.96*pr3$se.fit]
tmp[, H3_u:= pr3$fit+1.96*pr3$se.fit]
tmp[, H4_p:= pr4$fit]
tmp[, H4_l:= pr4$fit-1.96*pr4$se.fit]
tmp[, H4_u:= pr4$fit+1.96*pr4$se.fit]
tmp <- melt(tmp, id.vars=c('COHORT'), measure.vars=colnames(tmp)[grepl('^H[0-9]+',colnames(tmp))])
set(tmp, tmp[, which(value<0)], 'value', 0)
set(tmp, tmp[, which(value>1)], 'value', 1)
set(tmp, NULL, 'TYPE', tmp[, gsub('_','',regmatches(variable, regexpr('_[a-z]', variable)))])
set(tmp, NULL, 'variable', tmp[, regmatches(variable, regexpr('[A-Za-z0-9]+', variable))])
tmp <- tmp[, list(value=sum(value)), by=c('COHORT','TYPE','variable')]
tmp2 <- subset(tmp, variable=='H0' & TYPE=='p')
setnames(tmp2, 'value', 'BL')
tmp <- merge(tmp, subset(tmp2, select=c(COHORT, BL)), by='COHORT')
tmp2 <- subset(tmp, variable=='H0all')
setnames(tmp2, 'value', 'TOTAL')
tmp <- merge(tmp, subset(tmp2, select=c(COHORT, TOTAL)), by='COHORT')
tmp <- subset(tmp, variable!='H0u' & variable!='H0all' & TYPE=='p')
tmp[, REDUCTION_N:= BL-value]
tmp[, REDUCTION_P:= (BL-value)/TOTAL]
tmp <- subset(merge(tmp, data.table(variable=c('H1','H2','H3','H4'), LABEL=c('D','D,C','D,C,pot recombinant','D,C,other,pot recomb')), by='variable'), select=c(COHORT, LABEL, REDUCTION_N, REDUCTION_P, TOTAL))
ans <- rbind(ans, tmp)
#
# no ART self report
tmp <- copy(dpr)
pr0 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
set(tmp, tmp[, which(ART%in%c('ART started or self reported'))], 'ART', 'no ART')
pr1 <- predict(m2.5.CONS, newdata=tmp, type='response', se.fit=TRUE)
tmp[, H0u_p:= UNASS]
tmp[, H0all_p:= 1]
tmp[, H0_p:= pr0$fit]
tmp[, H0_l:= pr0$fit-1.96*pr0$se.fit]
tmp[, H0_u:= pr0$fit+1.96*pr0$se.fit]
tmp[, H1_p:= pr1$fit]
tmp[, H1_l:= pr1$fit-1.96*pr1$se.fit]
tmp[, H1_u:= pr1$fit+1.96*pr1$se.fit]
tmp <- melt(tmp, id.vars=c('COHORT'), measure.vars=colnames(tmp)[grepl('^H[0-9]+',colnames(tmp))])
set(tmp, tmp[, which(value<0)], 'value', 0)
set(tmp, tmp[, which(value>1)], 'value', 1)
set(tmp, NULL, 'TYPE', tmp[, gsub('_','',regmatches(variable, regexpr('_[a-z]', variable)))])
set(tmp, NULL, 'variable', tmp[, regmatches(variable, regexpr('[A-Za-z0-9]+', variable))])
tmp <- tmp[, list(value=sum(value)), by=c('COHORT','TYPE','variable')]
tmp2 <- subset(tmp, variable=='H0' & TYPE=='p')
setnames(tmp2, 'value', 'BL')
tmp <- merge(tmp, subset(tmp2, select=c(COHORT, BL)), by='COHORT')
tmp2 <- subset(tmp, variable=='H0all')
setnames(tmp2, 'value', 'TOTAL')
tmp <- merge(tmp, subset(tmp2, select=c(COHORT, TOTAL)), by='COHORT')
tmp <- subset(tmp, variable!='H0u' & variable!='H0all' & TYPE=='p')
tmp[, REDUCTION_N:= BL-value]
tmp[, REDUCTION_P:= (BL-value)/TOTAL]
tmp <- subset(merge(tmp, data.table(variable=c('H1'), LABEL=c('ART not self-reported or started')), by='variable'), select=c(COHORT, LABEL, REDUCTION_N, REDUCTION_P, TOTAL))
ans <- rbind(ans, tmp)
#
# RCCS batches as typical UG-MRC 15034 (UG-MRC)
# bad batches as typical from same region 15699 (RCCS)
tmp <- subset(drs2npr, COHORT=='RCCS')
pr0 <- predict(m2.1, newdata=tmp, type='response', se.fit=TRUE)
tmp2 <- subset(m12.1.or, l95>1 & grepl('BATCH',COEF))[, gsub('BATCH','',COEF)]
set(tmp, tmp[, which(BATCH%in%tmp2)], 'BATCH', '15699 (RCCS)')
pr1 <- predict(m2.1, newdata=tmp, type='response', se.fit=TRUE)
tmp2 <- subset(subset(drs2a, !grepl('PRIOR',TAXA) & grepl('RCCS',BATCH))[, list(P_UNASS=round(sum(1-ASS)/length(ASS), d=2)), by='BATCH'], P_UNASS>0.38)[, BATCH]
set(tmp, tmp[, which(BATCH%in%tmp2)], 'BATCH', '15034 (UG-MRC)')
pr2 <- predict(m2.1, newdata=tmp, type='response', se.fit=TRUE)
tmp[, H0u_p:= UNASS]
tmp[, H0all_p:= 1]
tmp[, H0_p:= pr0$fit]
tmp[, H0_l:= pr0$fit-1.96*pr0$se.fit]
tmp[, H0_u:= pr0$fit+1.96*pr0$se.fit]
tmp[, H1_p:= pr1$fit]
tmp[, H1_l:= pr1$fit-1.96*pr1$se.fit]
tmp[, H1_u:= pr1$fit+1.96*pr1$se.fit]
tmp[, H2_p:= pr2$fit]
tmp[, H2_l:= pr2$fit-1.96*pr2$se.fit]
tmp[, H2_u:= pr2$fit+1.96*pr2$se.fit]
tmp <- melt(tmp, id.vars=c('COHORT'), measure.vars=colnames(tmp)[grepl('^H[0-9]+',colnames(tmp))])
set(tmp, tmp[, which(value<0)], 'value', 0)
set(tmp, tmp[, which(value>1)], 'value', 1)
set(tmp, NULL, 'TYPE', tmp[, gsub('_','',regmatches(variable, regexpr('_[a-z]', variable)))])
set(tmp, NULL, 'variable', tmp[, regmatches(variable, regexpr('[A-Za-z0-9]+', variable))])
tmp <- tmp[, list(value=sum(value)), by=c('COHORT','TYPE','variable')]
tmp2 <- subset(tmp, variable=='H0' & TYPE=='p')
setnames(tmp2, 'value', 'BL')
tmp <- merge(tmp, subset(tmp2, select=c(COHORT, BL)), by='COHORT')
tmp2 <- subset(tmp, variable=='H0all')
setnames(tmp2, 'value', 'TOTAL')
tmp <- merge(tmp, subset(tmp2, select=c(COHORT, TOTAL)), by='COHORT')
tmp <- subset(tmp, variable!='H0u' & variable!='H0all' & TYPE=='p')
tmp[, REDUCTION_N:= BL-value]
tmp[, REDUCTION_P:= (BL-value)/TOTAL]
tmp <- subset(merge(tmp, data.table(variable=c('H1','H2'), LABEL=c('no sig plates','as avg UG MRC')), by='variable'), select=c(COHORT, LABEL, REDUCTION_N, REDUCTION_P, TOTAL))
ans <- rbind(ans, tmp)
#
# write csv
set(ans, NULL, 'REDUCTION_P_L', ans[, paste(round(REDUCTION_P*100,d=1),'pc',sep='')])
#ans[, paste(unique(LABEL),collapse='", "')]
#set(ans, NULL, 'LABEL', ans[, factor(LABEL, levels=c("Assembled primer sites", "VL >1e4", "VL >2e4", "VL >4e4", "VL >1e5", "D", "Batch run as typical run from same comm", "PR 2F no mutation", "PR 2F, 2R no mutation", "ART not self-reported or started"))])
ans <- dcast.data.table(ans, LABEL~COHORT, value.var='REDUCTION_P_L')
write.csv(ans, row.names=FALSE, file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_percentreductions.csv'))
#
# save
#
save(drs2, m2.1, m2.2, m2.3, m2.1.or, file=file.path(wdir, gsub('.rda','_logistic_160905.rda',wfile)))
tmp <- copy(m2.1.or)
set(tmp, NULL, 'COEF', tmp[, gsub('COHORT','',gsub('PR_NTDIFFc','',gsub('VL','viral load ',gsub('BATCH','plate ',COEF))))])
set(tmp, NULL, 'OR', tmp[, round(OR,d=2)])
set(tmp, NULL, 'LABEL', tmp[, as.character(OR)])
set(tmp, NULL, 'l95', tmp[, round(l95,d=2)])
set(tmp, NULL, 'u95', tmp[, round(u95,d=2)])
set(tmp, NULL, 'LABEL2', tmp[, paste(l95,'-',u95,sep='')])
tmp2 <- tmp[, which(l95>1 & l95<=3)]
set(tmp, tmp2, 'LABEL', tmp[tmp2,paste(LABEL,' *',sep='')])
tmp2 <- tmp[, which(l95>3)]
set(tmp, tmp2, 'LABEL', tmp[tmp2,paste(LABEL,' **',sep='')])
write.csv(subset(tmp, !is.na(OR), select=c(COEF,LABEL,LABEL2)), row.names=FALSE, file=file.path(wdir, gsub('.rda','_logistic_160905.csv',wfile)))
#
# old stuff
#
#
# what are these batches?
#
set(drs2a, NULL, 'BATCH', drs2a[, as.integer(regmatches(BATCH,regexpr('[0-9]+',BATCH)))])
tmp2 <- subset(m2.1.or, grepl('BATCH',COEF) & grepl('(RCCS)',COEF,fixed=1) & l95>1)[, as.integer(regmatches(COEF,regexpr('[0-9]+',COEF)))]
db <- subset(drs2a, BATCH %in% tmp2)
db[, BATCH_UNASS:='Y']
tmp <- subset(drs2a, COHORT=='RCCS' & !BATCH%in%tmp)
tmp[, BATCH_UNASS:='N']
db <- rbind(db,tmp)
# compare first locations ( % of samples from... )
tmp <- db[, {
#z <- subset(db, BATCH_UNASS=='Y')[,table(as.character(COMM_NUM))]
z <- table(as.character(COMM_NUM))
ans <- as.data.table(binconf(z, sum(z)))
ans[, TYPE:=names(z)]
setnames(ans, c('PointEst','Lower','Upper'),c('central','l95','u95'))
ans <- melt(ans, id.vars='TYPE')
set(ans, NULL, 'value', ans[,round(value*100,d=1)])
ans
}, by='BATCH_UNASS']
dcast.data.table(tmp, TYPE~BATCH_UNASS+variable)
#
# batch models with BATCH VL
#
m.1 <- glm(data=drs, ASS ~ VL + COHORT + PR_NTDIFFc + ART + BATCHVL + BATCH, family='binomial')
summary(m.1)
# batch VL not significant, drop..
m.2 <- glm(data=drs, ASS ~ BATCH + VL + COHORT + PR_NTDIFFc + ART - 1, family='binomial')
summary(m.2)
# with batches, UG-MRC worse than RCCS + PR_NTDIFFc2F/2R mut significant, but also PR_NTDIFFc2R0 significant (simply knock on)
m.3 <- glm(data=drs, ASS ~ VL + COHORT + PR_NTDIFFc + ART, family='binomial')
summary(m.3)
# without batches, RCCS worse than UG-MRC: batches offer better explanation than study for RCCS
m.2.or<- cbind(data.table(COEF=names(coef(m.2))), as.data.table( exp(cbind(OR = coef(m.2), confint(m.2))) ) )
setnames(m.2.or, c('2.5 %','97.5 %'), c('l95','u95'))
subset(m.2.or, u95<0.95)
#
# batch models with BATCH VL
#
m2f.1 <- glm(data=drs2f, ASS ~ VL + COHORT + ART + NT_DIFFc + BATCHVL + BATCH, family='binomial')
summary(m2f.1)
m2f.1.or<- cbind(data.table(COEF=names(coef(m2f.1))), as.data.table( exp(cbind(OR = coef(m2f.1), confint(m2f.1))) ) )
setnames(m2f.1.or, c('2.5 %','97.5 %'), c('l95','u95'))
m2r.1 <- glm(data=drs2r, ASS ~ VL + COHORT + ART + NT_DIFF_2Rc + BATCHVL + BATCH, family='binomial')
summary(m2r.1)
m2r.1.or<- cbind(data.table(COEF=names(coef(m2r.1))), as.data.table( exp(cbind(OR = coef(m2r.1), confint(m2r.1))) ) )
setnames(m2r.1.or, c('2.5 %','97.5 %'), c('l95','u95'))
#
# batch models without BATCH VL
#
m2f.2 <- glm(data=drs2f, ASS ~ VL + COHORT + ART + NT_DIFF_2Fc + BATCH, family='binomial')
summary(m2f.2)
m2f.2.or<- cbind(data.table(COEF=names(coef(m2f.2))), as.data.table( exp(cbind(OR = coef(m2f.2), confint(m2f.2))) ) )
setnames(m2f.2.or, c('2.5 %','97.5 %'), c('l95','u95'))
m2r.2 <- glm(data=drs2r, ASS ~ VL + COHORT + ART + NT_DIFF_2Rc + BATCH, family='binomial')
summary(m2r.2)
m2r.2.or<- cbind(data.table(COEF=names(coef(m2r.2))), as.data.table( exp(cbind(OR = coef(m2r.2), confint(m2r.2))) ) )
setnames(m2r.2.or, c('2.5 %','97.5 %'), c('l95','u95'))
#
# no BATCHES
#
m2f.3 <- glm(data=drs2f, ASS ~ VL + COHORT + ART + NT_DIFF_2Fc, family='binomial')
summary(m2f.3)
m2f.3.or<- cbind(data.table(COEF=names(coef(m2f.3))), as.data.table( exp(cbind(OR = coef(m2f.3), confint(m2f.3))) ) )
setnames(m2f.3.or, c('2.5 %','97.5 %'), c('l95','u95'))
m2r.3 <- glm(data=drs2r, ASS ~ VL + COHORT + ART + NT_DIFF_2Rc, family='binomial')
summary(m2r.3)
m2r.3.or<- cbind(data.table(COEF=names(coef(m2r.3))), as.data.table( exp(cbind(OR = coef(m2r.3), confint(m2r.3))) ) )
setnames(m2r.3.or, c('2.5 %','97.5 %'), c('l95','u95'))
#
# no BATCHES and subtype
#
m2f.4 <- glm(data=drs2f, ASS ~ VL + COHORT + ART + ST + NT_DIFF_2Fc, family='binomial')
summary(m2f.4)
m2f.4.or<- cbind(data.table(COEF=names(coef(m2f.4))), as.data.table( exp(cbind(OR = coef(m2f.4), confint(m2f.4))) ) )
setnames(m2f.4.or, c('2.5 %','97.5 %'), c('l95','u95'))
m2r.4 <- glm(data=drs2r, ASS ~ VL + COHORT + ART + ST + NT_DIFF_2Rc, family='binomial')
summary(m2r.4)
m2r.4.or<- cbind(data.table(COEF=names(coef(m2r.4))), as.data.table( exp(cbind(OR = coef(m2r.4), confint(m2r.4))) ) )
setnames(m2r.4.or, c('2.5 %','97.5 %'), c('l95','u95'))
#
# NT_DIFF_2Fcat least one mutation in models 3,4 and stronger when batches included
# NT_DIFF_2Rcat least one mutation not sig
#
# NT_DIFF_2FcUnassembled and NT_DIFF_2RcUnassembled always sig
#
# STD and STB or C significant for 2F
# compare significant batches
tmp <- subset(m2f.1.or, u95<0.95)
tmp[, PR:='2F']
tmp[, MODEL:= 'adjusting for avg VL in batch']
tmp2 <- subset(m2r.1.or, u95<0.95)
tmp2[, PR:='2R']
tmp2[, MODEL:= 'adjusting for avg VL in batch']
tmp <- rbind(tmp, tmp2)
tmp2 <- subset(m2f.2.or, u95<0.95)
tmp2[, PR:='2F']
tmp2[, MODEL:= 'not adjusting for avg VL in batch']
tmp <- rbind(tmp, tmp2)
tmp2 <- subset(m2r.2.or, u95<0.95)
tmp2[, PR:='2R']
tmp2[, MODEL:= 'not adjusting for avg VL in batch']
tmp <- rbind(tmp, tmp2)
tmp <- subset(tmp, grepl('BATCH', COEF) & COEF!='BATCHNo matched ID')
set(tmp, NULL, 'BATCH', tmp[,gsub('BATCH','',COEF)])
tmp <- merge(tmp, unique(subset(dr, select=c(BATCH, COHORT)), by=c('BATCH','COHORT')), by='BATCH')
#
ggplot(subset(tmp, MODEL=='adjusting for avg VL in batch' & COHORT=='RCCS'), aes(x=BATCH, y=OR, ymin=l95, ymax=u95)) + geom_point() + geom_errorbar() + facet_grid(~PR) + coord_flip()
ggsave(file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_batch_oddsratio_BATCHVL.pdf'), w=7, h=10)
ggplot(subset(tmp, MODEL=='not adjusting for avg VL in batch' & COHORT=='RCCS'), aes(x=BATCH, y=OR, ymin=l95, ymax=u95)) + geom_point() + geom_errorbar() + facet_grid(~PR) + coord_flip()
ggsave(file=file.path(wdir, 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_batch_oddsratio_NOBATCHVL.pdf'), w=7, h=10)
#
subset(m2r.4.or, u95<0.95)
subset(m2r.1.or, u95<0.95)
#
# batch models without BATCH VL and ignoring AC resistance
#
m2f.2b <- glm(data=subset(drs2f, COHORT!='AC_Resistance'), ASS ~ VL + COHORT + ART + NT_DIFF_2Fc + BATCH, family='binomial')
summary(m2f.2b)
m2f.2b.or<- cbind(data.table(COEF=names(coef(m2f.2b))), as.data.table( exp(cbind(OR = coef(m2f.2b), confint(m2f.2b))) ) )
setnames(m2f.2b.or, c('2.5 %','97.5 %'), c('l95','u95'))
m2r.2b <- glm(data=subset(drs2r, COHORT!='AC_Resistance'), ASS ~ VL + COHORT + ART + NT_DIFF_2Rc + BATCH, family='binomial')
summary(m2r.2b)
m2r.2b.or<- cbind(data.table(COEF=names(coef(m2r.2b))), as.data.table( exp(cbind(OR = coef(m2r.2b), confint(m2r.2b))) ) )
setnames(m2r.2b.or, c('2.5 %','97.5 %'), c('l95','u95'))
#
# Rakai model with communities
#
m2f.4 <- glm(data=drs2f, ASS ~ VL + ART + LOC + COMM_NUM + NT_DIFF_2Fc, family='binomial')
summary(m2f.4)
m2f.4.or<- cbind(data.table(COEF=names(coef(m2f.4))), as.data.table( exp(cbind(OR = coef(m2f.4), confint(m2f.4))) ) )
setnames(m2f.4.or, c('2.5 %','97.5 %'), c('l95','u95'))
subset(m2f.4.or, u95<0.95)
m2r.4 <- glm(data=drs2r, ASS ~ VL + ART + LOC + COMM_NUM + NT_DIFF_2Rc, family='binomial')
summary(m2r.4)
m2r.4.or<- cbind(data.table(COEF=names(coef(m2r.4))), as.data.table( exp(cbind(OR = coef(m2r.4), confint(m2r.4))) ) )
setnames(m2r.4.or, c('2.5 %','97.5 %'), c('l95','u95'))
subset(m2r.4.or, u95<0.95)
save(drs2f, drs2r, m2f.1, m2r.1, m2f.1.or, m2r.1.or, m2f.2, m2r.2, m2f.2.or, m2r.2.or, m2f.3, m2r.3, m2f.3.or, m2r.3.or, m2f.4, m2r.4, m2f.4.or, m2r.4.or, file=file.path(wdir, gsub('.rda','_logistic.rda',wfile)))
}
##--------------------------------------------------------------------------------------------------------
## olli 25.07.16
##--------------------------------------------------------------------------------------------------------
treecomparison.explaingaps.plots.160725<- function()
{
require(ape)
require(scales)
require(data.table)
require(Hmisc)
wdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/explaingaps'
wfile <- 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment.rda'
#
# deal with repeats in global alignment
#
load(file.path(wdir, wfile))
#
# re-plot alignment with primers highlighted and all gap columns included too
#
min.coverage <- 600
min.depth <- 10
# convert into chunks
ch <- lapply(seq_len(nrow(sq)), function(i)
{
z <- gregexpr('1+', paste(as.numeric( !as.character( sq[i,] )%in%c('-','?','n') ), collapse='') )[[1]]
data.table(PANGEA_ID= rownames(sq)[i], POS=as.integer(z), DEPTH=min.depth, REP=attr(z,"match.length"))
})
ch <- do.call('rbind',ch)
# define SITE
ch[, SITE:=NA_character_]
tmp <- ch[, which(grepl('^R[0-9]+_',PANGEA_ID))]
set(ch, tmp, 'SITE', 'ZA')
tmp <- ch[, which(is.na(SITE) & grepl('PG[0-9]+-[A-Z]+',PANGEA_ID))]
set(ch, tmp, 'SITE', ch[tmp, regmatches(PANGEA_ID, regexpr('PG[0-9]+-[A-Z]+',PANGEA_ID))])
set(ch, NULL, 'SITE', ch[, gsub('PG[0-9]+-','',SITE)])
ch <- subset(ch, !is.na(SITE))
ch <- merge(ch, ch[, list(COV=sum(REP)), by='PANGEA_ID'], by='PANGEA_ID')
# select min.coverage, select min.depth
ch <- subset(ch, COV>=min.coverage & DEPTH>=min.depth)
# define chunks
ch[, POS_NEXT:= POS+REP]
ch <- ch[, list(SITE=SITE, POS=POS, DEPTH=DEPTH, REP=REP, CHUNK=cumsum(as.numeric(c(TRUE, POS[-1]!=POS_NEXT[-length(POS_NEXT)])))), by='PANGEA_ID']
ch <- ch[, list(SITE=SITE[1], POS_CH=min(POS), REP_CH=sum(REP), DEPTH_CH= sum(DEPTH*REP)/sum(REP) ), by=c('PANGEA_ID','CHUNK')]
ch[, DEPTH_MIN:=min.depth]
set(ch, NULL, 'SITE', ch[, factor(SITE, levels=c('BW', 'ZA', 'UG'), labels=c('Botswana', 'South Africa', 'Uganda'))])
ch <- merge(ch, ch[, list(COV=sum(REP_CH)), by='PANGEA_ID'], by='PANGEA_ID')
ch[, COVP:= COV/ncol(sq)]
#
require(ggplot2)
require(viridis)
ch <- merge(ch, ch[, list(POS_CHF=min(POS_CH)), by='PANGEA_ID'], by='PANGEA_ID')
setkey(ch, SITE, PANGEA_ID)
tmp <- unique(ch, by=c('SITE','PANGEA_ID'))
setkey(tmp, POS_CHF)
tmp[, PLOT:=ceiling(seq_len(nrow(tmp))/1070)]
ch <- merge(ch, subset(tmp, select=c(PANGEA_ID, PLOT)), by='PANGEA_ID')
set(ch, NULL, 'PLOT', ch[, factor(PLOT, levels=c(4,3,2,1), labels=c(4,3,2,1))])
setkey(ch, POS_CH, SITE)
set(ch, NULL, 'PANGEA_ID', ch[, factor(PANGEA_ID, levels=unique(PANGEA_ID), labels=unique(PANGEA_ID))])
dpani <- subset(dpan, !is.na(START))[, list(START=START[1], END=START[1]+max(IDX)-1L), by='PR']
ggplot(ch) +
geom_segment(aes(y=PANGEA_ID, yend=PANGEA_ID, x=POS_CH, xend=POS_CH+REP_CH-1L, colour=SITE)) +
geom_rect(data=dpani, aes(xmin=START, xmax=END, ymin=-Inf, ymax=Inf), fill="#3690C0") +
geom_text(data=dpani, aes(x=START, y=seq_len(length(START))*10+10, label=PR), colour="#3690C0", hjust=-.2, size=2) +
facet_wrap(~PLOT, scales='free_y', ncol=4) +
scale_x_continuous(expand=c(0,0), breaks=seq(0,10e3,1e3), minor_breaks=seq(0,10e3,100)) +
scale_colour_manual(values=c('Botswana'="#1B0C42FF", 'South Africa'="#CF4446FF", 'Uganda'="#781C6DFF")) +
labs(x='\nalignment position', y='PANGEA-HIV sequences\n', colour='sampling\nlocation') +
theme_bw() +
theme( axis.text.y=element_blank(), axis.ticks.y=element_blank(), axis.line.y=element_blank(), legend.position='bottom',
strip.text= element_blank(), strip.background=element_blank()) +
guides(colour=guide_legend(override.aes=list(size=5)))
ggsave(file=file.path(wdir,gsub('.rda','_gapsprimers.pdf',wfile)), w=15, h=15, limitsize = FALSE)
#
# plot Rakai samples by REGA subtype
#
setnames(ch, 'PANGEA_ID', 'TAXA')
tmp <- unique(subset(dpand, !is.na(PANGEA_ID), select=c(PANGEA_ID, TAXA, STUDY_ID, REGA_GAG_A, REGA_GAG_AS, REGA_GAG_PURE, REGA_GAG_PURES)), by=c('PANGEA_ID','TAXA'))
chr <- merge(ch, tmp, by='TAXA', all.x=1)
set(chr, chr[, which(is.na(REGA_GAG_A))], 'REGA_GAG_A', 'No matched ID')
chr <- subset(chr, !is.na(STUDY_ID))
# redefine ordering
chr[, PLOT:=NULL]
setkey(chr, REGA_GAG_A, TAXA)
tmp <- unique(chr,by=c('REGA_GAG_A','TAXA'))
setkey(tmp, REGA_GAG_A, COVP, TAXA)
tmp <- tmp[, list(TAXA=TAXA, PLOT=REGA_GAG_A, PLOT_ID=seq_along(TAXA)), by='REGA_GAG_A']
chr <- merge(chr, subset(tmp, select=c(TAXA, PLOT_ID)), by='TAXA')
set(chr, NULL, 'REGA_GAG_A', chr[, factor(REGA_GAG_A, levels=c("A (A1)", "D", "C", "Check bootscan", "D (10_CD)", "CRF 21_A2D", "G", "Sequence error", "Unassigned"))])
ggplot(subset(chr, REGA_GAG_A%in%c("A (A1)", "D", "C", "Check bootscan"))) +
geom_segment(aes(y=PLOT_ID, yend=PLOT_ID, x=POS_CH, xend=POS_CH+REP_CH-1L, colour=REGA_GAG_A)) +
geom_rect(data=dpani, aes(xmin=START, xmax=END, ymin=-Inf, ymax=Inf), fill="black") +
facet_wrap(~REGA_GAG_A, scales='free_y', ncol=4) +
scale_x_continuous(expand=c(0,0), breaks=dpani$START, labels=dpani$PR) +
scale_y_continuous(expand=c(0,0)) +
scale_colour_brewer(palette='Set1') +
labs(x='\nalignment position', y='Rakai PANGEA-HIV sequences\n', colour='Rega subtype assignment on gag') +
theme_bw() +
theme( legend.position='bottom', strip.text= element_blank(), strip.background=element_blank()) +
guides(colour=guide_legend(override.aes=list(size=5)))
ggsave(file=file.path(wdir,gsub('.rda','_gapsprimers_REGAsubtypes.pdf',wfile)), w=15, h=15, limitsize = FALSE)
#
# plot Rakai samples by COMET subtype
#
tmp <- unique(subset(dpand, !is.na(PANGEA_ID), select=c(PANGEA_ID, TAXA, STUDY_ID, COMET_Region1, COMET_Region2, COMET_Region3)), by=c('PANGEA_ID','TAXA'))
chr <- merge(ch, tmp, by='TAXA', all.x=1)
chr <- subset(chr, !is.na(STUDY_ID))
set(chr, chr[, which(is.na(COMET_Region1))], 'COMET_Region1', 'unassigned')
set(chr, chr[, which(is.na(COMET_Region2))], 'COMET_Region2', 'unassigned')
set(chr, chr[, which(is.na(COMET_Region3))], 'COMET_Region3', 'unassigned')
chr[, COMET_Region123:= paste(COMET_Region1,COMET_Region2,COMET_Region3,sep='-')]
set(chr, chr[, which(!COMET_Region123%in%c('A1-A1-A1','A2-A2-A2','B-B-B','C-C-C','D-D-D','other-other-other','unassigned-unassigned-unassigned'))], 'COMET_Region123', 'mixed')
# Region123 mixes cause and effect: must have good representation of all regions in order to call subtypes well
chr[, COMET_Region13:= paste(COMET_Region1,COMET_Region3,sep='-')]
set(chr, chr[, which(!COMET_Region13%in%c('A1-A1','A2-A2','B-B','C-C','D-D','other-other','unassigned-unassigned'))], 'COMET_Region13', 'mixed')
# not sure if this should be used..
# redefine ordering
chr[, PLOT:=NULL]
setkey(chr, COMET_Region1, TAXA)
tmp <- unique(chr, by=c('COMET_Region1','TAXA'))
setkey(tmp, COMET_Region1, COVP, TAXA)
tmp <- tmp[, list(TAXA=TAXA, PLOT=COMET_Region1, PLOT_ID=seq_along(TAXA)), by='COMET_Region1']
chr <- merge(chr, subset(tmp, select=c(TAXA, PLOT_ID)), by='TAXA')
ggplot(subset(chr, !COMET_Region1%in%c('check','other'))) +
geom_segment(aes(y=PLOT_ID, yend=PLOT_ID, x=POS_CH, xend=POS_CH+REP_CH-1L, colour=COMET_Region1)) +
geom_rect(data=dpani, aes(xmin=START, xmax=END, ymin=-Inf, ymax=Inf), fill="black") +
geom_vline(xintercept=c(2200,3000)) +
facet_wrap(~COMET_Region1, scales='free_y', ncol=4) +
scale_x_continuous(expand=c(0,0), breaks=dpani$START, labels=dpani$PR) +
scale_y_continuous(expand=c(0,0)) +
scale_colour_brewer(palette='Set1') +
labs(x='\nalignment position', y='Rakai PANGEA-HIV sequences\n', colour='COMET subtype assignment\non region 1') +
theme_bw() +
theme( legend.position='bottom', strip.text= element_blank(), strip.background=element_blank()) +
guides(colour=guide_legend(override.aes=list(size=5)))
ggsave(file=file.path(wdir,gsub('.rda','_gapsprimers_COMETsubtypes.pdf',wfile)), w=15, h=10, limitsize = FALSE)
#
# plot Rakai samples by Sanger processing batch
#
tmp <- unique(subset(dpand, !is.na(PANGEA_ID), select=c(PANGEA_ID, TAXA, STUDY_ID, SANGER_ID)), by=c('PANGEA_ID','TAXA','SANGER_ID'))
chr <- merge(ch, tmp, by='TAXA', all.x=1)
#chr <- subset(chr, !is.na(STUDY_ID))
chr[, BATCH:=NA_character_]
tmp <- chr[, which(!is.na(SANGER_ID))]
set(chr, tmp, 'BATCH', chr[tmp, regmatches(SANGER_ID,regexpr('^[0-9]+', SANGER_ID))])
set(chr, chr[, which(is.na(BATCH))], 'BATCH', 'No matched ID')
# redefine ordering
chr[, PLOT:=NULL]
setkey(chr, BATCH, TAXA)
tmp <- unique(chr, by=c('BATCH','TAXA'))
setkey(tmp, BATCH, COVP, TAXA)
tmp <- tmp[, list(TAXA=TAXA, PLOT=BATCH, PLOT_ID=seq_along(TAXA)), by='BATCH']
chr <- merge(chr, subset(tmp, select=c(TAXA, PLOT_ID)), by='TAXA')
ggplot(chr) +
geom_segment(aes(y=PLOT_ID, yend=PLOT_ID, x=POS_CH, xend=POS_CH+REP_CH-1L, colour=factor(is.na(STUDY_ID), levels=c(TRUE,FALSE), labels=c('OTHER','Rakai')))) +
geom_rect(data=dpani, aes(xmin=START, xmax=END, ymin=-Inf, ymax=Inf), fill="black") +
facet_wrap(~BATCH, scales='free_y', ncol=4) +
scale_x_continuous(expand=c(0,0), breaks=dpani$START, labels=dpani$PR) +
scale_y_continuous(expand=c(0,0)) +
#scale_colour_brewer(palette='Set1') +
labs(x='\nalignment position', y='Rakai PANGEA-HIV sequences\n', colour='Cohort site') +
theme_bw() +
theme( legend.position='bottom') +
guides(colour=guide_legend(override.aes=list(size=5)))
ggsave(file=file.path(wdir,gsub('.rda','_gapsprimers_Batch.pdf',wfile)), w=15, h=100, limitsize = FALSE)
#
# plot Rakai samples by ART
#
# redefine ordering
tmp <- unique(subset(dpand, !is.na(PANGEA_ID), select=c(PANGEA_ID, TAXA, STUDY_ID, SAMPLEDATE, ARTSTART, selfReportArt, everSelfReportArt, RECENTVL)), by=c('PANGEA_ID','TAXA'))
tmp[, ART:= as.integer(ARTSTART<SAMPLEDATE)]
set(tmp, tmp[, which(is.na(ART))], 'ART', 0L)
set(tmp, tmp[, which(ART==0 & everSelfReportArt==1)], 'ART', 2L)
set(tmp, tmp[, which(ART==0 & RECENTVL<1e4)], 'ART', 3L)
set(tmp, NULL, 'ART', tmp[, factor(ART, levels=c(0L,1L,2L,3L), labels=c('no ART', 'ART started', 'ART self reported','no ART but VL<1e4'))])
chr <- merge(ch, tmp, by='TAXA', all.x=1)
set(chr, chr[, which(is.na(ART))], 'ART', 'No matched ID')
chr <- subset(chr, !is.na(STUDY_ID))
# redefine ordering
chr[, PLOT:=NULL]
setkey(chr, TAXA)
tmp <- unique(chr, by='TAXA')
setkey(tmp, ART, COVP, TAXA)
tmp <- tmp[, list(TAXA=TAXA, PLOT=ART, PLOT_ID=seq_along(TAXA)), by='ART']
chr <- merge(chr, subset(tmp, select=c(TAXA, PLOT_ID)), by='TAXA')
#set(chr, NULL, 'REGA_GAG_A', chr[, factor(REGA_GAG_A, levels=c("A (A1)", "D", "C", "Check bootscan", "D (10_CD)", "CRF 21_A2D", "G", "Sequence error", "Unassigned"))])
ggplot(chr) +
geom_segment(aes(y=PLOT_ID, yend=PLOT_ID, x=POS_CH, xend=POS_CH+REP_CH-1L, colour=ART)) +
geom_rect(data=dpani, aes(xmin=START, xmax=END, ymin=-Inf, ymax=Inf), fill="black") +
facet_wrap(~ART, scales='free_y', ncol=4) +
scale_x_continuous(expand=c(0,0), breaks=dpani$START, labels=dpani$PR) +
scale_y_continuous(expand=c(0,0)) +
scale_colour_brewer(palette='Set2') +
labs(x='\nalignment position', y='Rakai PANGEA-HIV sequences\n', colour='ART status') +
theme_bw() +
theme( legend.position='bottom', strip.text= element_blank(), strip.background=element_blank()) +
guides(colour=guide_legend(override.aes=list(size=5)))
ggsave(file=file.path(wdir,gsub('.rda','_gapsprimers_ARTstatus.pdf',wfile)), w=15, h=15, limitsize = FALSE)
#
# plot Rakai samples by ART
#
# redefine ordering
tmp <- unique(subset(dpand, !is.na(PANGEA_ID), select=c(PANGEA_ID, TAXA, STUDY_ID, SAMPLEDATE, ARTSTART, selfReportArt, everSelfReportArt, RECENTVL)), by=c('PANGEA_ID','TAXA'))
tmp[, VL:= cut(RECENTVL, breaks=c(0, 1e4, 2e4, 4e4, 1e5, Inf), labels=c('<1e4','1e4-2e4','2e4-4e4','4e4-1e5','>1e5'))]
set(tmp, tmp[, which(is.na(VL))], 'VL', 'No VL measured')
chr <- merge(ch, tmp, by='TAXA', all.x=1)
set(chr, chr[, which(is.na(VL))], 'VL', 'No matched ID')
chr <- subset(chr, !is.na(STUDY_ID))
chr[, PLOT:=NULL]
setkey(chr, TAXA)
tmp <- unique(chr, by='TAXA')
setkey(tmp, VL, COVP, TAXA)
tmp <- tmp[, list(TAXA=TAXA, PLOT=VL, PLOT_ID=seq_along(TAXA)), by='VL']
chr <- merge(chr, subset(tmp, select=c(TAXA, PLOT_ID)), by='TAXA')
ggplot(chr) +
geom_segment(aes(y=PLOT_ID, yend=PLOT_ID, x=POS_CH, xend=POS_CH+REP_CH-1L, colour=VL)) +
geom_rect(data=dpani, aes(xmin=START, xmax=END, ymin=-Inf, ymax=Inf), fill="black") +
facet_wrap(~VL, scales='free_y', ncol=6) +
scale_x_continuous(expand=c(0,0), breaks=dpani$START, labels=dpani$PR) +
scale_y_continuous(expand=c(0,0)) +
scale_colour_brewer(palette='Dark2') +
labs(x='\nalignment position', y='Rakai PANGEA-HIV sequences\n', colour='Viral load status') +
theme_bw() +
theme( legend.position='bottom', strip.text= element_blank(), strip.background=element_blank()) +
guides(colour=guide_legend(override.aes=list(size=5)))
ggsave(file=file.path(wdir,gsub('.rda','_gapsprimers_VLstatus.pdf',wfile)), h=7, w=20, limitsize = FALSE)
#
# plot Rakai samples by region
#
# redefine ordering
tmp <- unique(subset(dpand, !is.na(PANGEA_ID), select=c(PANGEA_ID, TAXA, STUDY_ID, LOC)), by=c('PANGEA_ID','TAXA'))
set(tmp, NULL, 'LOC', tmp[, factor(LOC)])
chr <- merge(ch, tmp, by='TAXA', all.x=1)
chr <- subset(chr, !is.na(STUDY_ID))
chr[, PLOT:=NULL]
setkey(chr, TAXA)
tmp <- unique(chr, by='TAXA')
setkey(tmp, LOC, COVP, TAXA)
tmp <- tmp[, list(TAXA=TAXA, PLOT=LOC, PLOT_ID=seq_along(TAXA)), by='LOC']
chr <- merge(chr, subset(tmp, select=c(TAXA, PLOT_ID)), by='TAXA')
ggplot(chr) +
geom_segment(aes(y=PLOT_ID, yend=PLOT_ID, x=POS_CH, xend=POS_CH+REP_CH-1L, colour=LOC)) +
geom_rect(data=dpani, aes(xmin=START, xmax=END, ymin=-Inf, ymax=Inf), fill="black") +
facet_wrap(~LOC, scales='free_y', ncol=6) +
scale_x_continuous(expand=c(0,0), breaks=dpani$START, labels=dpani$PR) +
scale_y_continuous(expand=c(0,0)) +
scale_colour_brewer(palette='Dark2') +
labs(x='\nalignment position', y='Rakai PANGEA-HIV sequences\n', colour='region') +
theme_bw() +
theme( legend.position='bottom', strip.text= element_blank(), strip.background=element_blank()) +
guides(colour=guide_legend(override.aes=list(size=5)))
ggsave(file=file.path(wdir,gsub('.rda','_gapsprimers_REGION.pdf',wfile)), h=7, w=20, limitsize = FALSE)
}
##--------------------------------------------------------------------------------------------------------
## olli 25.07.16
##--------------------------------------------------------------------------------------------------------
treecomparison.explaingaps.countgaps<- function()
{
require(ape)
require(data.table)
require(big.phylo)
wdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/explaingaps'
wfile <- 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment.rda'
#
load(file.path(wdir,wfile))
#
# merge MRC Uganda
#
set(dm, dm[, which(grepl('MRC',COHORT))], 'COHORT', 'UG-MRC')
#
# get missing summaries UNASS_RAW: between sequence and reference, UNASS: in global alignment
#
tmp <- merge(dgd, subset(dm, select=c(TAXA,COHORT)),by='TAXA')
tmp2 <- merge(subset(dgd.seqs, select=c(TAXA,SANGER_ID,GENE,LEN_RAW,UNASS_RAW)), subset(dm, select=c(TAXA)),by='TAXA')
tmp <- merge(tmp, tmp2, by=c('TAXA','GENE'), all.x=1)
tmp[, LEN:=END-START+1L]
tmp[, UNASS_N:= LEN*UNASS]
tmp[, UNASS_RAW_N:= LEN_RAW*UNASS_RAW]
tmp <- unique(tmp, by=c('TAXA','GENE','COHORT'))
stopifnot( nrow(subset(tmp, !is.na(UNASS_RAW) & UNASS_RAW_N>=(UNASS_N+1)))==0 )
#
# how many more missing chars because of alignment expansion?
#
subset(tmp, GENE=='GAG+POL+ENV')[, list(EXTRA_P= mean(UNASS_N-UNASS_RAW_N)/LEN[1], EXTRA_N= mean(UNASS_N-UNASS_RAW_N)), by='COHORT']
# COHORT EXTRA
#1: BW-Mochudi 0.0154979007
#2: UG-MRC 0.0280886145
#3: RCCS 0.0515666397
#4: AC_Resistance 0.0005832984
#5: Uganda 0.0469642640
ggplot(subset(tmp, GENE=='GAG+POL+ENV'), aes(x=UNASS_RAW_N, y=UNASS_N)) +
geom_point(alpha=0.25, size=1.5) + geom_abline(slope=1, intercept=0, linetype='dotted') +
scale_x_continuous(expand=c(0,0), breaks=seq(0,1e4,1e3)) +
scale_y_continuous(expand=c(0,0), breaks=seq(0,1e4,1e3)) +
theme_bw() + labs(x='\nnumber of missing characters\nrelative to reference sequence used to assemble reads', y='number of missing characters\nin sequence alignment\n')
ggsave(file.path('~/Dropbox (SPH Imperial College)/2016_PANGEA_treecomp/figures','PANGEA_extra_gaps_in_alignment.pdf'), w=6, h=6)
lm(data=subset(tmp, GENE=='GAG+POL+ENV'), UNASS_N~UNASS_RAW_N-1)
#Coefficients:
# UNASS_RAW_N
# 1.109
# add the odd RCCS sequence run
#tmp <- merge(tmp, subset(dm, select=c(TAXA, SANGER_ID)), by='TAXA')
#tmp[, SANGER_PLATE:= as.integer(regmatches(SANGER_ID, regexpr('^[0-9]+', SANGER_ID)))]
#tmp2 <- subset(tmp, SANGER_PLATE>=15892 & SANGER_PLATE<=15964)
#tmp2[, COHORT:='RCCS_run_odd_plates']
#tmp <- rbind(tmp, tmp2)
#tmp2 <- subset(tmp, SANGER_PLATE<15892 | SANGER_PLATE>15964)
#tmp2[, COHORT:='RCCS_other_plates']
#tmp <- rbind(tmp, tmp2)
# add all Uganda sequences
tmp2 <- subset(tmp, COHORT=='UG-MRC'|COHORT=='RCCS')
tmp2[, COHORT:='Uganda']
tmp <- rbind(tmp, tmp2)
# look at distribution of gaps by cohort
dgi <- subset(tmp, GENE=='GAG+POL+ENV')[, list(P=ecdf(1-UNASS_RAW)(seq(0,1,.01)), PC_ASS=seq(0,1,.01)), by=c('COHORT','GENE','LEN')]
subset(dgi, PC_ASS==0.8)
#> subset(dgi, PC_ASS==0.8)
#COHORT GENE LEN P PC_ASS
#1: BW-Mochudi GAG+POL+ENV 8926 0.2562674 0.8
#2: UG-MRC GAG+POL+ENV 8926 0.4051355 0.8
#3: RCCS GAG+POL+ENV 8926 0.7871514 0.8
#4: AC_Resistance GAG+POL+ENV 8926 0.0000000 0.8
# RCCS GAG+POL+ENV 8926 0.2392074 0.2
dgi <- subset(tmp, GENE=='GAG+POL+ENV' & COHORT%in%c('BW-Mochudi','UG-MRC','RCCS','AC_Resistance'))[, list(P=ecdf(1-UNASS_RAW)(seq(0,1,.01)), PC_ASS=seq(0,1,.01))]
# 1: 0.206 0.2
#
# make histogram
tmp2 <- subset(tmp, GENE=='GAG+POL+ENV' & COHORT%in%c('BW-Mochudi','UG-MRC','RCCS','AC_Resistance'))
set(tmp2, NULL, 'COHORT', tmp2[, factor(COHORT, levels=c('RCCS','BW-Mochudi','UG-MRC','AC_Resistance'),
labels=c('Rakai Community Cohort Study', 'Mochudi Prevention Project', 'Uganda-MRC', 'South Africa Resistance Cohort'))])
tmp2[, ASS_RAW_N:=LEN_RAW-UNASS_RAW_N]
tmp2[, ASS_N:=LEN-UNASS_N]
tmp2 <- tmp2[, list(P_UNASS=seq(0,1,0.01), CUM= length(UNASS)*ecdf(UNASS)(seq(0,1,0.01)), CUM_RAW=length(UNASS)*ecdf(UNASS_RAW)(seq(0,1,0.01)) ), by='COHORT']
ggplot(tmp2, aes(x=P_UNASS, fill=COHORT, y=CUM_RAW)) +
geom_bar(stat='identity', position='stack') +
scale_x_continuous(labels=percent, expand=c(0,0), limit=c(-0.01,1.01), breaks=seq(0, 1, 0.2)) +
scale_y_continuous(expand=c(0,0), limits=c(0,4000))+
scale_fill_manual(values=c('Mochudi Prevention Project'="#33638DFF", 'South Africa Resistance Cohort'="#CF4446FF", 'Rakai Community Cohort Study'="#A8327DFF", 'Uganda-MRC'="#29AF7FFF")) +
theme_bw() + theme(legend.position='bottom') +
guides(fill=guide_legend(ncol=2)) +
labs( x='\nx% = proportion of unassembled sites per sequence is at most x%',
y='PANGEA-HIV sequences (cumulated)\n',
fill='cohort site')
ggsave(file.path('~/Dropbox (SPH Imperial College)/2016_PANGEA_treecomp/figures','PANGEA_cumulated_by_gaps.pdf'), w=7, h=7)
#
# summary of gaps in alignment
#
dgi <- tmp[, list(ASS= c(mean(1-UNASS), quantile(1-UNASS, prob=c(0.25,.5,.75))), STAT=c('mean',paste('qu',c(0.25,.5,.75),sep='')) ), by=c('COHORT','GENE','LEN')]
dgi[, P:= 100*round(ASS, d=2)]
dgi[, N:= round(ASS*LEN,d=0)]
dgi[, LABEL:= paste(N,'/',LEN,' (',P,'%)',sep='')]
#dcast.data.table(subset(dgi, STAT=='mean'), GENE~COHORT, value.var='LABEL')
dgi <- dcast.data.table(subset(dgi, STAT=='mean'), GENE+LEN~COHORT, value.var='P')
write.csv(dgi, row.names=FALSE, file=file.path(wdir,'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_countgaps.csv'))
#
# summary of gaps per sequence
#
dgs <- subset(tmp, !is.na(UNASS_RAW))[, list(STAT=c('ASS_P','LEN'), V=c(100*round(mean(1-UNASS_RAW),d=2), round(mean(LEN_RAW),d=1)) ), by=c('COHORT','GENE')]
dgs <- dcast.data.table(dgs, GENE~STAT+COHORT, value.var='V')
write.csv(dgs, row.names=FALSE, file=file.path(wdir,'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_countgaps_raw.csv'))
#
# save
#
save(dgi, dgs, file=file.path(wdir,'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_countgaps.rda'))
}
##--------------------------------------------------------------------------------------------------------
## olli 25.07.16
##--------------------------------------------------------------------------------------------------------
treecomparison.explaingaps.collect.data<- function()
{
require(ape)
require(data.table)
require(big.phylo)
wdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/explaingaps'
wfile <- 'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment.rda'
#
# deal with repeats in global alignment
#
if(0)
{
sq <- read.dna("~/Dropbox (SPH Imperial College)/PANGEA_data/PANGEAconsensuses_2015-09_Imperial/PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment.fasta", format='fasta')
sqi <- data.table(TAXA=rownames(sq), DUMMY=seq_len(nrow(sq)))
tmp <- sqi[, which(duplicated(TAXA))]
set(sqi, tmp, 'TAXA', sqi[tmp, paste(TAXA,'-R2',sep='')])
setkey(sqi, DUMMY)
rownames(sq) <- sqi[,TAXA]
tmp <- sapply(seq_len(nrow(sq)), function(i) base.freq(sq[i,], all=TRUE, freq=TRUE))
sqi[, COV:=ncol(sq)-apply( tmp[c('-','?'),], 2, sum )]
sqi[, PNG:= sqi[, factor(grepl('PG',TAXA),levels=c(TRUE,FALSE),labels=c('Y','N'))]]
sqi[, SITE:= NA_character_]
tmp <- sqi[, which(PNG=='Y')]
set(sqi, tmp, 'SITE', sqi[tmp, substring(sapply(strsplit(TAXA,'-'),'[[',2),1,2)])
sqi[, PANGEA_ID:= gsub('-R[0-9]+','',TAXA)]
# get gap-column free alignment
sqp <- sq[subset(sqi, PNG=='Y' | grepl('HXB2', TAXA))[, TAXA],]
sqp <- seq.rmgaps(sqp, rm.only.col.gaps=1, verbose=1, rm.char=c('-','?'))
# save
write.dna(sq, file=file.path(wdir,'PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment.fasta'), format='fa')
save(sq, sqp, sqi, file=file.path(wdir,wfile))
}
#
# find primer coordinates and gene coordinates in PANGEA alignment
#
if(0)
{
load(file.path(wdir,wfile))
#
pan1f <- 'agcc.gggagctctctg'
pan1r <- 'cctccaattcc.cctatcatttt'
pan2f <- 'gggaagtga.atagc.ggaac'
pan2r <- 'ctgccatctgttttccata.tc'
pan3f <- 'ttaaaagaaaaggggggattggg'
pan3r <- 'tggc.ytgtaccgtcagcg'
pan4f <- 'cctatggcaggaagaagcg'
pan4r <- 'ctt.tatgcag..tctgaggg' #I had to remove one nucleotide here it is: CW ( c. ) instead of (..)
# get the forward code of the reverse primers
dpan <- rbind( data.table(PR='1R', SEQR=strsplit(pan1r,'')[[1]], IDXR=seq_len(nchar(pan1r)), IDX=rev(seq_len(nchar(pan1r))) ),
data.table(PR='2R', SEQR=strsplit(pan2r,'')[[1]], IDXR=seq_len(nchar(pan2r)), IDX=rev(seq_len(nchar(pan2r)))),
data.table(PR='3R', SEQR=strsplit(pan3r,'')[[1]], IDXR=seq_len(nchar(pan3r)), IDX=rev(seq_len(nchar(pan3r)))),
data.table(PR='4R', SEQR=strsplit(pan4r,'')[[1]], IDXR=seq_len(nchar(pan4r)), IDX=rev(seq_len(nchar(pan4r)))) )
dpan <- merge(dpan, data.table(SEQR=c('a','t','c','g','.'), SEQ=c('t','a','g','c','.')), by='SEQR')
setkey(dpan, PR, IDX)
# add the forward primers
tmp <- rbind( data.table(PR='1F', SEQ=strsplit(pan1f,'')[[1]], IDX=seq_len(nchar(pan1f)) ),
data.table(PR='2F', SEQ=strsplit(pan2f,'')[[1]], IDX=seq_len(nchar(pan2f)) ),
data.table(PR='3F', SEQ=strsplit(pan3f,'')[[1]], IDX=seq_len(nchar(pan3f)) ),
data.table(PR='4F', SEQ=strsplit(pan4f,'')[[1]], IDX=seq_len(nchar(pan4f)) ) )
dpan <- rbind( tmp, dpan, use.names=TRUE, fill=TRUE)
# check primers exist in HXB2
load("~/git/hivclust/pkg/data/refseq_hiv1_hxb2.rda")
hxb2 <- subset(hxb2, !is.na(HXB2.Position))[, paste(as.character(HXB2.K03455),collapse='')]
tmp <- dpan[, list(START=unlist(gregexpr(paste(SEQ, collapse=''),hxb2))), by='PR']
stopifnot( tmp[, all(START>1)] )
# build region table
tmp <- dcast.data.table(dpan[, list(SEQ=paste(SEQ,collapse='')), by='PR'], .~PR, value.var='SEQ')
dgenec <- rbind( data.table(PRIMER='N', GENE='GAG', LOC='start', SEQ=strsplit("atgggtgcgagagcgtcagtatt",'')[[1]]),
data.table(PRIMER='N', GENE='GAG', LOC='end', SEQ=strsplit("aacgacccctcgtcacaataa",'')[[1]]),
data.table(PRIMER='N', GENE='POL', LOC='start', SEQ=strsplit("cctcaggtcactctttggca",'')[[1]]),
data.table(PRIMER='N', GENE='POL', LOC='end', SEQ=strsplit("gtagacaggatgaggattag",'')[[1]]),
data.table(PRIMER='N', GENE='ENV', LOC='start', SEQ=strsplit("atgagagtgaaggagaaatatcag",'')[[1]]),
data.table(PRIMER='N', GENE='ENV', LOC='end', SEQ=strsplit("cttggaaaggattttgctataa",'')[[1]]),
data.table(PRIMER='N', GENE='GAG+POL+ENV', LOC='start', SEQ=strsplit("atgggtgcgagagcgtcagtatt",'')[[1]]),
data.table(PRIMER='N', GENE='GAG+POL+ENV', LOC='end', SEQ=strsplit("cttggaaaggattttgctataa",'')[[1]]),
data.table(PRIMER='Y', GENE='START-2F', LOC='start', SEQ=strsplit("atgggtgcgagagcgtcagtatt",'')[[1]]),
data.table(PRIMER='Y', GENE='START-2F', LOC='end', SEQ=strsplit(tmp[['2F']],'')[[1]]),
data.table(PRIMER='Y', GENE='START-1R', LOC='start', SEQ=strsplit("atgggtgcgagagcgtcagtatt",'')[[1]]),
data.table(PRIMER='Y', GENE='START-1R', LOC='end', SEQ=strsplit(tmp[['1R']],'')[[1]]),
data.table(PRIMER='Y', GENE='2F-1R', LOC='start', SEQ=strsplit(tmp[['2F']],'')[[1]]),
data.table(PRIMER='Y', GENE='2F-1R', LOC='end', SEQ=strsplit(tmp[['1R']],'')[[1]]),
data.table(PRIMER='Y', GENE='1R-3F', LOC='start', SEQ=strsplit(tmp[['1R']],'')[[1]]),
data.table(PRIMER='Y', GENE='1R-3F', LOC='end', SEQ=strsplit(tmp[['3F']],'')[[1]]),
data.table(PRIMER='Y', GENE='3F-2R', LOC='start', SEQ=strsplit(tmp[['3F']],'')[[1]]),
data.table(PRIMER='Y', GENE='3F-2R', LOC='end', SEQ=strsplit(tmp[['2R']],'')[[1]]),
data.table(PRIMER='Y', GENE='3F-4F', LOC='start', SEQ=strsplit(tmp[['3F']],'')[[1]]),
data.table(PRIMER='Y', GENE='3F-4F', LOC='end', SEQ=strsplit(tmp[['4F']],'')[[1]]),
data.table(PRIMER='Y', GENE='2R-4F', LOC='start', SEQ=strsplit(tmp[['2R']],'')[[1]]),
data.table(PRIMER='Y', GENE='2R-4F', LOC='end', SEQ=strsplit(tmp[['4F']],'')[[1]]),
data.table(PRIMER='Y', GENE='4F-3R', LOC='start', SEQ=strsplit(tmp[['4F']],'')[[1]]),
data.table(PRIMER='Y', GENE='4F-3R', LOC='end', SEQ=strsplit(tmp[['3R']],'')[[1]]),
data.table(PRIMER='Y', GENE='3R-END', LOC='start', SEQ=strsplit(tmp[['3R']],'')[[1]]),
data.table(PRIMER='Y', GENE='3R-END', LOC='end', SEQ=strsplit("cttggaaaggattttgctataa",'')[[1]])
)
# find positions in alignment
tmp <- which(grepl('HXB2',rownames(sqp)))
sqhxb2 <- paste(as.character(sqp[tmp,]), collapse='')
tmp <- dgenec[, list(START=unlist(gregexpr(paste(SEQ, collapse='-*'),sqhxb2)), LEN=length(SEQ)), by=c('PRIMER','GENE','LOC')]
tmp2 <- tmp[, which(grepl('GAG|ENV|POL',GENE) & LOC=='end')]
set(tmp, tmp2, 'START', tmp[tmp2, START+LEN-1L])
tmp2 <- tmp[, which(!grepl('GAG|ENV|POL',GENE) & LOC=='end')]
set(tmp, tmp2, 'START', tmp[tmp2, START-1L])
tmp2 <- tmp[, which(!grepl('GAG|ENV|POL',GENE) & LOC=='start')]
set(tmp, tmp2, 'START', tmp[tmp2, START+LEN])
dgene <- dcast.data.table(tmp, PRIMER+GENE~LOC, value.var='START')
setnames(dgene, colnames(dgene), toupper(colnames(dgene)))
# add half regions
tmp <- subset(dgene, PRIMER=='Y')
set(tmp, NULL, 'GENE', tmp[, paste(GENE,'-firsthalf',sep='')])
set(tmp, NULL, 'END', tmp[, START+(END-START)/2])
dgene <- rbind(dgene, tmp)
set(tmp, NULL, 'GENE', tmp[, gsub('-firsthalf','-secondhalf',GENE)])
set(tmp, NULL, 'END', tmp[, START+(END-START)*2])
set(tmp, NULL, 'START', tmp[, START+(END-START)/2])
dgene <- rbind(dgene, tmp)
set(dgene, NULL, 'START', dgene[,round(START,d=0)])
set(dgene, NULL, 'END', dgene[,round(END,d=0)])
#
dgene[, LEN:=END-START+1L]
set(dgene, NULL, 'GENE', dgene[, factor(GENE, levels=c( "GAG+POL+ENV","GAG","POL","ENV",
"START-2F","START-1R","2F-1R","1R-3F","3F-2R","2R-4F","3F-4F","4F-3R","3R-END",
"START-2F-firsthalf","START-1R-firsthalf","2F-1R-firsthalf","1R-3F-firsthalf","3F-2R-firsthalf","3F-4F-firsthalf","2R-4F-firsthalf","4F-3R-firsthalf","3R-END-firsthalf",
"START-2F-secondhalf","START-1R-secondhalf","2F-1R-secondhalf","1R-3F-secondhalf","3F-2R-secondhalf","3F-4F-secondhalf","2R-4F-secondhalf","4F-3R-secondhalf","3R-END-secondhalf"))])
setkey(dgene, GENE)
set(dgene, NULL, 'PRIMER', NULL)
#
tmp <- dpan[, list(START=unlist(gregexpr(paste(SEQ, collapse='-*'),sqhxb2))), by=c('PR')]
dpan <- merge(dpan, subset(tmp, START>0), by='PR',all.x=1)
#
# add non-ambiguous primers for 1R 2F 2R 3F (none) 4F (none)
# pan1r <- 'cctccaattccYcctatcatttt'. 'y = c or t'. So in forward sense: G or A
# pan2f <- 'gggaagtgaYatagcWggaac'. 'W= a or t'.
# pan2r <- 'ctgccatctgttttccataRtc'. 'R= a or g'. So in forward sense: T or C
tmp <- subset(dpan, PR=='1R')
tmp[, PR:='1Ra']
set(tmp, 12L, 'SEQR', 'c')
set(tmp, 12L, 'SEQ', 'g')
dpan <- rbind(dpan, tmp)
tmp[, PR:='1Rb']
set(tmp, 12L, 'SEQR', 't')
set(tmp, 12L, 'SEQ', 'a')
dpan <- rbind(dpan, tmp)
tmp <- subset(dpan, PR=='2F')
tmp[, PR:='2Fa']
set(tmp, 10L, 'SEQ', 'c')
set(tmp, 16L, 'SEQ', 'a')
dpan <- rbind(dpan, tmp)
tmp[, PR:='2Fb']
set(tmp, 10L, 'SEQ', 't')
set(tmp, 16L, 'SEQ', 't')
dpan <- rbind(dpan, tmp)
tmp <- subset(dpan, PR=='2R')
tmp[, PR:='2Ra']
set(tmp, 3L, 'SEQR', 'a')
set(tmp, 3L, 'SEQ', 't')
dpan <- rbind(dpan, tmp)
tmp <- subset(dpan, PR=='2R')
tmp[, PR:='2Rb']
set(tmp, 3L, 'SEQR', 'g')
set(tmp, 3L, 'SEQ', 'c')
dpan <- rbind(dpan, tmp)
#
save(sq, sqp, sqi, dgene, dgenec, dpan, file=file.path(wdir,wfile))
# extract START to 1R alignment
write.dna(sqp[, 1:(1893+23)], file=file.path(wdir, paste(gsub('.rda','',wfile),'_region_1F1R.fasta', sep='')), format='fa', colsep='', nbcol=-1)
write.dna(sqp[, 4570:5573], file=file.path(wdir, paste(gsub('.rda','',wfile),'_region_3F4F.fasta', sep='')), format='fa', colsep='', nbcol=-1)
write.dna(sqp[, 5555:8903], file=file.path(wdir, paste(gsub('.rda','',wfile),'_region_4F3R.fasta', sep='')), format='fa', colsep='', nbcol=-1)
write.dna(sqp[ subset(dgd, GENE=='GAG+POL+ENV' & grepl('UG',TAXA) & LEN*(1-UNASS)>8000)[, TAXA], ], file=file.path(wdir, paste(gsub('.rda','',wfile),'_UGfull.fasta', sep='')), format='fa', colsep='', nbcol=-1)
write.dna(sqp[ subset(dgd, GENE=='GAG+POL+ENV' & grepl('ZA|BW',TAXA) & LEN*(1-UNASS)>8000)[, TAXA], ], file=file.path(wdir, paste(gsub('.rda','',wfile),'_ZABWfull.fasta', sep='')), format='fa', colsep='', nbcol=-1)
write.dna(sqp[, 5555:8903], file=file.path(wdir, paste(gsub('.rda','',wfile),'_region_4F3R.fasta', sep='')), format='fa', colsep='', nbcol=-1)
# extract primer alignments and write to file
subset(dpan, !is.na(START))[, {
write.dna(sqp[, seq.int(START[1], len=length(START))], file=file.path(wdir, paste(gsub('.rda','',wfile),'_primer_',PR[1],'_start_',START[1],'.fasta', sep='')), format='fa', colsep='', nbcol=-1)
}, by='PR']
# extract primer alignments and write to file +- 75 bp
subset(dpan, !is.na(START))[, {
write.dna(sqp[, seq.int(START[1]-75L, len=length(START)+75L)], file=file.path(wdir, paste(gsub('.rda','',wfile),'_primerplusminus75bp_',PR[1],'_start_',START[1],'.fasta', sep='')), format='fa', colsep='', nbcol=-1)
}, by='PR']
}
#
# map new IDs to old IDs
#
if(0)
{
infile.new <- '~/Dropbox (SPH Imperial College)/PANGEA_data/PANGEAconsensuses_2015-09_Imperial/PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_NewIDs_161212.fasta'
sn <- read.dna(infile.new, format='fa')
# check seqences are in order
tmp <- sapply(seq_len(nrow(sq)), function(i) dist.dna(rbind(sq[i,],sn[i,]), model='raw') )
stopifnot(!any(tmp>0))
# map IDs
sqi[, PANGEA_ID2:= rownames(sn)]
#
save(sq, sqp, sqi, dgene, dgenec, dpan, file=file.path(wdir,wfile))
}
#
# calculate number of gaps in gene regions without expanding gaps in alignment
#
if(0)
{
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/missingchar_alignments'
#infiles <- data.table(F=list.files(indir, pattern='fasta$',full.names=TRUE))
#tmp <- infiles[,{
# #F <- '/Users/Oliver/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/missingchar_alignments/15097_1_64_MinCov_10_50_wHXB2.fasta'
# cat('\n',F)
# ss <- read.dna(F, format='fasta')
# list(NCOL=ncol(ss))
# },by='F']
#write.csv(tmp, file='~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/missingchar_alignments/checklen.csv')
dgd.seqs<- infiles[,{
#F <- '/Users/Oliver/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/missingchar_alignments/15097_1_64_MinCov_10_50_wHXB2.fasta'
cat('\n',F)
ss <- read.dna(F, format='fasta')
# find positions in alignment of current SANGER ID
sqhxb2 <- paste(as.character(ss[grepl('^B\\.FR\\.83\\.HXB2',rownames(ss)),]), collapse='')
tmp <- dgenec[, list(START=unlist(gregexpr(paste(SEQ, collapse='-*'),sqhxb2)), LEN=length(SEQ)), by=c('PRIMER','GENE','LOC')]
tmp2 <- tmp[, which(grepl('GAG|ENV|POL',GENE) & LOC=='end')]
set(tmp, tmp2, 'START', tmp[tmp2, START+LEN-1L])
tmp2 <- tmp[, which(!grepl('GAG|ENV|POL',GENE) & LOC=='end')]
set(tmp, tmp2, 'START', tmp[tmp2, START-1L])
tmp2 <- tmp[, which(!grepl('GAG|ENV|POL',GENE) & LOC=='start')]
set(tmp, tmp2, 'START', tmp[tmp2, START+LEN])
dgene2 <- dcast.data.table(tmp, PRIMER+GENE~LOC, value.var='START')
setnames(dgene2, colnames(dgene2), toupper(colnames(dgene2)))
ans <- dgene2[, {
#print(GENE)
#START<- 1506; END<-2384
tmp <- ss[!grepl('^B\\.FR\\.83\\.HXB2',rownames(ss)), START:END]
tmp <- seq.rmgaps(tmp, rm.only.col.gaps=1, rm.char=c('-','?'), verbose=0)
list( LEN_RAW= ncol(tmp),
UNASS_RAW=mean( as.character( tmp[grepl('^[0-9]+_[0-9]_[0-9]+',rownames(tmp)),] )=='?' ))
}, by='GENE']
ans[, SID:= gsub('^([0-9]+_[0-9]_[0-9]+)_.*','\\1',rownames(ss)[grepl('^[0-9]+_[0-9]_[0-9]+',rownames(ss))])]
ans
},by='F']
dgd.seqs[, SANGER_ID:= gsub('^([0-9]+_[0-9]+_[0-9]+)_.*','\\1',basename(F))]
stopifnot( nrow(subset(dgd.seqs, is.na(SANGER_ID)))==0 )
dgd.seqs[, F:=NULL]
dgd.seqs[, SID:=NULL]
save(sq, sqp, sqi, dgene, dpan, dgd.seqs, file=file.path(wdir,wfile))
}
#
# calculate number of mutations in primers and gaps in gene regions
#
if(0)
{
load(file.path(wdir,wfile))
# get differences relative to HXB2 on primer by primer position (IDX)
dpand <- subset(dpan, !is.na(START))[, {
#START <- rep(1894, 23); z<- '1Ra'
z <- PR
psq <- as.character( sqp[!grepl('HXB2',rownames(sqp)), seq.int(START[1], len=length(START))] )
tmp <- subset(dpan, PR==z)[, gsub('\\.','n',paste(SEQ, collapse=''))]
tmp <- unlist(strsplit(tmp,''))
z <- which(tmp=='n')
if(length(z))
{
psq <- psq[, -z]
tmp <- tmp[-z]
}
tmp <- as.data.table( melt( t( t(psq)==tmp ), varnames=c('TAXA','POS') ) )
#sqhxb2i <- which(grepl('HXB2',rownames(psq)))
#tmp <- as.data.table( melt( t( t(psq)==psq[sqhxb2i,] ) ) )
setnames(tmp, 'value', 'NT_DIFF')
tmp2 <- as.data.table( melt( psq=='?', varnames=c('TAXA','POS') ) )
setnames(tmp2, 'value', 'MISS')
tmp <- merge(tmp, tmp2, by=c('TAXA','POS'))
tmp2 <- as.data.table( melt( psq=='n', varnames=c('TAXA','POS') ) )
setnames(tmp2, 'value', 'ANY')
tmp <- merge(tmp, tmp2, by=c('TAXA','POS'))
set(tmp, NULL, 'NT_DIFF', tmp[, as.integer(!NT_DIFF)])
# adjust primer position according to z
while(length(z))
{
tmp2<- tmp[, which(POS>=z[1])]
set(tmp, tmp2, 'POS', tmp[tmp2, POS+1L])
z <- z[-1]
}
set(tmp, NULL, 'POS', tmp[, paste('PR_',POS,sep='')])
set(tmp, tmp[, which(MISS)], 'NT_DIFF', NA_integer_)
set(tmp, tmp[, which(ANY)], 'NT_DIFF', NA_integer_)
tmp[, MISS:=NULL]
tmp[, ANY:=NULL]
#tmp
list(TAXA=tmp$TAXA, POS=tmp$POS, NT_DIFF=tmp$NT_DIFF)
}, by='PR' ]
set(dpand, NULL, 'POS', dpand[, factor(POS, levels=paste('PR_',1:dpand[, length(unique(POS))],sep=''))])
set(dpand, NULL, 'TAXA', dpand[, as.character(TAXA)])
set(dpand, NULL, 'POS', dpand[, as.character(POS)])
# update 1R
tmp <- dcast.data.table(subset(dpand, PR=='1Ra' | PR=='1Rb'), TAXA+POS~PR, value.var='NT_DIFF')
setnames(tmp, c('1Ra','1Rb'), c('pr1Ra','pr1Rb'))
tmp[, NT_DIFF:= pr1Ra*pr1Rb]
tmp[, PR:='1R']
set(tmp, NULL, c('pr1Ra','pr1Rb'), NULL)
dpand <- rbind(subset(dpand, PR!='1R'), tmp)
# update 2F
tmp <- dcast.data.table(subset(dpand, PR=='2Fa' | PR=='2Fb'), TAXA+POS~PR, value.var='NT_DIFF')
setnames(tmp, c('2Fa','2Fb'), c('pr2Fa','pr2Fb'))
tmp[, NT_DIFF:= pr2Fa*pr2Fb]
tmp[, PR:='2F']
set(tmp, NULL, c('pr2Fa','pr2Fb'), NULL)
dpand <- rbind(subset(dpand, PR!='2F'), tmp)
# update 2R
tmp <- dcast.data.table(subset(dpand, PR=='2Ra' | PR=='2Rb'), TAXA+POS~PR, value.var='NT_DIFF')
setnames(tmp, c('2Ra','2Rb'), c('pr2Ra','pr2Rb'))
tmp[, NT_DIFF:= pr2Ra*pr2Rb]
tmp[, PR:='2R']
set(tmp, NULL, c('pr2Ra','pr2Rb'), NULL)
dpand <- rbind(subset(dpand, PR!='2R'), tmp)
#
# calculate number of '?' in each of the gene regions
#
z <- as.character( sqp )
dgd <- dgene[, {
#START<- 812; END_B4NXT<- 4341
tmp <- z[, seq.int(START, END)]
tmp <- apply(tmp=='?', 1, sum)
list( TAXA= names(tmp), UNASS= tmp/(END-START+1L) )
}, by=c('GENE','START','END','LEN')]
# add number not '?','n','-'
tmp <- dgene[, {
#START<- 812; END_B4NXT<- 4341
tmp <- z[, seq.int(START, END)]
tmp <- apply(!(tmp=='?'|tmp=='n'|tmp=='-'), 1, sum)
list( TAXA= names(tmp), ACTG= tmp/(END-START+1L) )
}, by=c('GENE','START','END')]
dgd <- merge(dgd, tmp, by=c('GENE','START','END','TAXA'))
dgd <- subset(dgd, !grepl('HXB2',TAXA))
#
save(sq, sqp, sqi, dgene, dgenec, dpan, dpand, dgd, dgd.seqs, file=file.path(wdir,wfile))
}
#
# get COMET subtypes
#
if(0)
{
infile <- "~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/explaingaps/PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_region_1F1R_COMETv0.5.txt"
dc <- as.data.table(read.table(infile, header=TRUE, sep='\t', stringsAsFactors=FALSE, strip.white=FALSE))
dc[, COMET_R:='COMET_1F1R']
dc[, COMET_V:='0.5']
infile <- "~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/explaingaps/PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_region_3F4F_COMETv0.5.txt"
tmp <- as.data.table(read.table(infile, header=TRUE, sep='\t', stringsAsFactors=FALSE, strip.white=FALSE))
tmp[, COMET_R:='COMET_3F4F']
tmp[, COMET_V:='0.5']
dc <- rbind(dc,tmp)
infile <- "~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/explaingaps/PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_region_4F3R_COMETv0.5.txt"
tmp <- as.data.table(read.table(infile, header=TRUE, sep='\t', stringsAsFactors=FALSE, strip.white=FALSE))
tmp[, COMET_R:='COMET_4F3R']
tmp[, COMET_V:='0.5']
dc <- rbind(dc,tmp)
infile <- "~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/explaingaps/PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_region_1F1R_COMETv2.1.txt"
tmp <- as.data.table(read.table(infile, header=TRUE, sep='\t', stringsAsFactors=FALSE, strip.white=FALSE))
tmp[, COMET_R:='COMET_1F1R']
tmp[, COMET_V:='2.1']
dc <- rbind(dc,tmp,use.name=TRUE,fill=TRUE)
infile <- "~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/explaingaps/PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_region_3F4F_COMETv2.1.txt"
tmp <- as.data.table(read.table(infile, header=TRUE, sep='\t', stringsAsFactors=FALSE, strip.white=FALSE))
tmp[, COMET_R:='COMET_3F4F']
tmp[, COMET_V:='2.1']
dc <- rbind(dc,tmp,use.name=TRUE,fill=TRUE)
infile <- "~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/explaingaps/PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_region_4F3R_COMETv2.1.txt"
tmp <- as.data.table(read.table(infile, header=TRUE, sep='\t', stringsAsFactors=FALSE, strip.white=FALSE))
tmp[, COMET_R:='COMET_4F3R']
tmp[, COMET_V:='2.1']
dc <- rbind(dc,tmp,use.name=TRUE,fill=TRUE)
infile <- "~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/explaingaps/PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_UGfull_COMETv2.1.txt"
tmp <- as.data.table(read.table(infile, header=TRUE, sep='\t', stringsAsFactors=FALSE, strip.white=FALSE))
tmp[, COMET_R:='COMET_full']
tmp[, COMET_V:='2.1']
dc <- rbind(dc,tmp,use.name=TRUE,fill=TRUE)
infile <- "~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/explaingaps/PANGEA_HIV_n4562_Imperial_v151113_GlobalAlignment_ZABWfull_COMETv2.1.txt"
tmp <- as.data.table(read.table(infile, header=TRUE, sep='\t', stringsAsFactors=FALSE, strip.white=FALSE))
tmp[, COMET_R:='COMET_full']
tmp[, COMET_V:='2.1']
dc <- rbind(dc,tmp,use.name=TRUE,fill=TRUE)
dc[, x:=NULL]
setnames(dc, c('bootstrap.support','name','subtype','length'), c('COMET_BS','TAXA','COMET_ST','COMET_N'))
dc <- subset(dc, !is.na(TAXA))
dc <- subset(dc, !grepl('HXB2',TAXA))
# reset length from dgd
tmp <- subset(dgd, GENE%in%c('START-1R','3F-4F','4F-3R','GAG+POL+ENV'))
set(tmp,NULL,'ACTG',tmp[,ACTG*LEN])
set(tmp,tmp[, which(GENE=='START-1R')],'GENE','COMET_1F1R')
set(tmp,tmp[, which(GENE=='3F-4F')],'GENE','COMET_3F4F')
set(tmp,tmp[, which(GENE=='4F-3R')],'GENE','COMET_4F3R')
set(tmp,tmp[, which(GENE=='GAG+POL+ENV')],'GENE','COMET_full')
setnames(tmp,c('GENE','ACTG'),c('COMET_R','COMET_ACTG'))
dc <- merge(subset(dc,select=c('TAXA','COMET_ST','COMET_R','COMET_N','COMET_V')),subset(tmp,select=c('TAXA','COMET_R','COMET_ACTG')),by=c('TAXA','COMET_R'))
if(0)
{
set(dc, dc[, which(grepl('unassigned', COMET_ST))], 'COMET_ST', 'unassigned')
set(dc, dc[, which(grepl('_', COMET_ST))], 'COMET_ST', 'pot_recombinant')
set(dc, dc[, which(!COMET_ST%in%c('A1','B','C','D','unassigned','pot_recombinant'))], 'COMET_ST', 'other')
}
if(1) #based on observation that on full genome, all recombinants are 'unassigned'
{
set(dc, dc[, which(grepl('unassigned', COMET_ST))], 'COMET_ST', 'pot_recombinant')
set(dc, dc[, which(grepl('_', COMET_ST))], 'COMET_ST', 'pot_recombinant')
set(dc, dc[, which(!COMET_ST%in%c('A1','B','C','D','pot_recombinant'))], 'COMET_ST', 'other')
}
# cross-compare assignments: assignments largely agree,
# except that COMET2.1 now makes more assignments on short sequences, that are mostly assigned A
dc[, table(COMET_ST,COMET_V,COMET_R)]
# use actual length
#set(dc, dc[, which(COMET_N<500 & COMET_V=='0.5')], 'COMET_ST', 'short')
set(dc, dc[, which(COMET_ACTG<500 & COMET_V=='0.5')], 'COMET_ST', 'short')
set(dc, dc[, which(COMET_ACTG<500 & COMET_V=='2.1')], 'COMET_ST', 'short')
# use COMETv2.1
dc <- subset(dc, COMET_V=='2.1',select=c('TAXA','COMET_R','COMET_ST','COMET_ACTG'))
setnames(dc, 'COMET_ACTG','COMET_N')
tmp <- dc[, {
ans <- 'pot_recombinant'
z <- which(!COMET_ST%in%c('short',"unassigned"))
if(all(COMET_ST[z]=="A1"))
ans<- 'A1'
if(all(COMET_ST[z]=="B"))
ans<- 'B'
if(all(COMET_ST[z]=="C"))
ans<- 'C'
if(all(COMET_ST[z]=="D"))
ans<- 'D'
if(all(COMET_ST[z]=="other"))
ans<- 'other'
if(length(which(COMET_ST=="unassigned"))>1)
ans<- 'unassigned'
if(all(COMET_ST=="short"))
ans<- 'short'
list(COMET_R='COMET_CONS',COMET_ST=ans, COMET_N=min(COMET_N))
}, by='TAXA']
dc <- rbind(dc, tmp, use.names=TRUE)
if(0)
{
# check COMET subtype assignments on ZA|BW full
tmp <- merge(dc, subset(dc, COMET_R=='COMET_full' & grepl('ZA|BW',TAXA), TAXA), by='TAXA')
tmp <- dcast.data.table(tmp, TAXA~COMET_R, value.var='COMET_ST')
# check COMET subtype assignments on UG full
tmp <- merge(dc, subset(dc, COMET_R=='COMET_full' & grepl('UG',TAXA), TAXA), by='TAXA')
tmp <- dcast.data.table(tmp, TAXA~COMET_R, value.var='COMET_ST')
tmp[, COMET_CLASS:='agree_pure_subtype']
set(tmp, tmp[, which(COMET_full=='unassigned' & COMET_CONS%in%c('unassigned','pot_recombinant'))], 'COMET_CLASS', 'agree_pot_recombinant')
set(tmp, tmp[, which(COMET_full=='unassigned' & !COMET_CONS%in%c('unassigned','pot_recombinant'))], 'COMET_CLASS', 'by_three_regions:_potentially_wrong_pure_subtype')
set(tmp, tmp[, which(COMET_full!='unassigned' & COMET_CONS%in%c('unassigned','pot_recombinant'))], 'COMET_CLASS', 'by_three_regions:_potentially_wrong_pot_recombinant')
# tmp[, table(COMET_CLASS)]
# agree_pot_recombinant agree_pure_subtype by_three_regions:_potentially_wrong_pot_recombinant by_three_regions:_potentially_wrong_pure_subtype
# 342 343 2 28
# prop of pure subtypes assignments that may be wrong: 28/(343+28)=0.0754717
# prop of pot recombinant and unassigned in 715 from UG: 370/715=0.517
# prop of pot recombinant and unassigned in 3628 from UG: (579+104)/3628=0.188
}
# ignore check analysis on 'COMET_full'
dc <- subset(dc, COMET_R!='COMET_full')
tmp2 <- dcast.data.table(dc, TAXA~COMET_R, value.var='COMET_N')
setnames(tmp2, setdiff(colnames(tmp2),'TAXA'), paste(setdiff(colnames(tmp2),'TAXA'),'_N',sep=''))
dc <- dcast.data.table(dc, TAXA~COMET_R, value.var='COMET_ST')
dc <- merge(dc, tmp2, by='TAXA')
#
save(sq, sqp, sqi, dgene, dgenec, dpan, dpand, dgd, dgd.seqs, dc, file=file.path(wdir,wfile))
}
#
# add meta-data
#
if(0)
{
dm <- unique(subset(dgd, select=TAXA), by='TAXA')
dm[, PANGEA_ID:=gsub('-S.*','',TAXA)]
# add RCCS data
load("~/Dropbox (SPH Imperial College)/Rakai Pangea Meta Data/Data for Fish Analysis Working Group/RakaiPangeaMetaData.rda")
rccsData <- as.data.table(rccsData)
rccsData <- subset(rccsData, select=c('Pangea.id', 'RCCS_studyid', 'date', 'batch', 'birthyr', 'REGION', 'COMM_NUM', 'HH_NUM', 'SEX', 'AGEYRS','firstPosDate', 'arvStartDate', 'selfReportArt', 'everSelfReportArt', 'FirstSelfReportArt', 'recentVL', 'recentVLdate'))
setnames(rccsData, c('Pangea.id','RCCS_studyid','REGION','date','birthyr','firstPosDate','arvStartDate','recentVL','recentVLdate',"AGEYRS"), c('PANGEA_ID','STUDY_ID','LOC','SAMPLEDATE','DOB','FIRSTPOSDATE','ARTSTART','RECENTVL','RECENTVLDATE',"AGE"))
rccsData <- subset(rccsData, !is.na(PANGEA_ID))
setkey(rccsData, PANGEA_ID)
rccsData <- unique(rccsData, by='PANGEA_ID') #remove 4 duplicates "K104085" "E106462" "F030186" "F101874"
rccsData[, COHORT:='RCCS']
set(rccsData, NULL, 'SAMPLEDATE', rccsData[,hivc.db.Date2numeric(SAMPLEDATE)])
set(rccsData, NULL, 'FIRSTPOSDATE', rccsData[,hivc.db.Date2numeric(FIRSTPOSDATE)])
set(rccsData, NULL, 'ARTSTART', rccsData[,hivc.db.Date2numeric(ARTSTART)])
set(rccsData, NULL, 'FirstSelfReportArt', rccsData[,hivc.db.Date2numeric(FirstSelfReportArt)])
set(rccsData, NULL, 'RECENTVLDATE', rccsData[,hivc.db.Date2numeric(RECENTVLDATE)])
# add Mochudi data
load("~/duke/2016_AC/PANGEA_160826/160826_PANGEA_BW_corevariables_n373.rda")
setnames(bwp, 'PANGEAID', 'PANGEA_ID')
set(bwp, NULL, 'SEQID', NULL)
meta <- rbind(rccsData, bwp, use.names=TRUE, fill=TRUE)
# add AC data
load("~/duke/2016_AC/PANGEA_160826/160826_PANGEA_AC_corevariables_n2940.rda")
setnames(acp, 'PANGEAID', 'PANGEA_ID')
set(acp, NULL, c('REASONSAMPLING','LATESTARTREGIMEN','CIRCUMCISED','LASTNEGDATE','LASTNUMSEXUALPARTNERS','LATESTARTREGIMENSTARTED'), NULL)
meta <- rbind(meta, acp, use.names=TRUE, fill=TRUE)
# add metadata from UCL
load('~/Dropbox (SPH Imperial College)/PANGEA_metadata/processed_metadata/PANGEA_meta_161128.rda')
# add info for missing meta-data
tmp <- as.data.table(read.csv('~/Dropbox (SPH Imperial College)/PANGEA_metadata/original_161128/MetaData_161219_Noexistingshareddata_resolved_sites.csv', stringsAsFactors=FALSE))
setnames(tmp, c('PANGEA_ID2','Site'), c('NEW_PANGEA_ID','COHORT_ID'))
tmp <- subset(tmp, select=c('NEW_PANGEA_ID','COHORT_ID'))
dfp <- rbind(dfp, tmp, fill=TRUE)
set(dfp, dfp[, which(GENDER=='')],'GENDER',NA_character_)
# fixup NEW_PANGEA_ID duplicates to own ID
#sqi[, which(grepl('PG15-UG001328',PANGEA_ID2))]
#
# check we have data for all sequences in alignment
#
set(dfp, NULL, 'NEW_PANGEA_ID', dfp[, gsub('-[0-9]+$','',NEW_PANGEA_ID)])
tmp <- data.table(PANGEA_ID2=setdiff( subset(sqi, PNG=='Y')[, PANGEA_ID2], dfp[, NEW_PANGEA_ID] ))
tmp <- merge(sqi, tmp, by='PANGEA_ID2')
#write.csv(tmp, file='~/Dropbox (SPH Imperial College)/PANGEA_metadata/processed_metadata/check_keys_OR_161219.csv')
stopifnot(!nrow(tmp))
#
set(dfp, NULL, c('SEQUENCE','SAMPLE_REASON','ART_REGIMEN','NGS_METHOD','PIPELINE','N_CUT_OFF','LC_CUT_OFF'), NULL)
dfp <- unique(dfp, by=c('NEW_PANGEA_ID','COHORT_ID','DOB_YEAR','GENDER','GEO_COUNTRY','GEO_CITY','SAMPLE_DATE'))
dfp[, ROW:=seq_len(nrow(dfp))]
# reset duplicate NEW_PANGEA_IDs
set(dfp, dfp[, which(grepl('PANGEA-N25977-UG2014',NEW_PANGEA_ID) & DOB_YEAR==1972)], 'NEW_PANGEA_ID', 'PANGEA-N25977OR2-UG2014')
set(dfp, dfp[, which(grepl('PANGEA-K17754-UG2012',NEW_PANGEA_ID) & DOB_YEAR==1979)], 'NEW_PANGEA_ID', 'PANGEA-K17754OR2-UG2012')
set(sqi, sqi[, which(grepl('PG15-UG001328',PANGEA_ID))], 'NEW_PANGEA_ID', 'PANGEA-N25977OR2-UG2014')
set(sqi, sqi[, which(grepl('PG15-UG000698',PANGEA_ID))], 'NEW_PANGEA_ID', 'PANGEA-K17754OR2-UG2012')
# remove arbitrarily with more missing data / unclear DOB or gender
set(dfp, c(7343L, 7225L, 5705L, 7190L, 7438L, 12024L, 7456L, 5606L, 7153L, 12172L), 'NEW_PANGEA_ID', NA_character_)
dfp <- subset(dfp, !is.na(NEW_PANGEA_ID))
dfp <- merge(dfp, dfp[, list(DUMMY2=length(COHORT_ID)), by='NEW_PANGEA_ID'],by='NEW_PANGEA_ID')
# there are still a few duplicates ...
subset(dfp, DUMMY2>1)
#
setnames(dfp, 'NEW_PANGEA_ID', 'PANGEA_ID2')
dfp <- merge(sqi, dfp, by='PANGEA_ID2', all.x=1)
# there is still 1 duplicate but in Rakai data, where I have the actual true data point.
negcontrols <- subset(dfp, grepl('position',COHORT_ID))
dfp <- subset(dfp, grepl('MRC|Rakai',COHORT_ID))
# remove one entry where we also have the Rakai original data
dfp <- subset(dfp, PANGEA_ID2!='PANGEA-V10270-UG2012')
setnames(dfp, c('PANGEA_ID2','TAXA','COHORT_ID','SAMPLE_DATE','DOB_YEAR','GENDER','GEO_COUNTRY','GEO_CITY','ON_ART','CD4_DATE','VL_DATE','VL_U','VL_L','CD4_U','CD4_L'), c('STUDY_ID','TAXA2','COHORT','SAMPLEDATE','DOB','SEX','GEO_COUNTRY','LOC','CURRENTLYONART','RECENTCD4DATE','RECENTVLDATE','RECENTVL_U','RECENTVL_L','RECENTCD4COUNT_U','RECENTCD4COUNT_L'))
set(dfp, NULL, c('DUMMY','COV','PNG','SITE','CD4_RANGE','VL_RANGE'), NULL)
set(dfp, NULL, 'PANGEA_ID', dfp[, gsub('-S[0-9]+$','',PANGEA_ID)])
dfp <- merge(dfp, dfp[, list(DUMMY3=length(COHORT)), by='PANGEA_ID'],by='PANGEA_ID')
# manual check: all remaining PANGEA_ID duplicates have same meta-data and are sequence replicates
subset(dfp, DUMMY3>1)
dfp <- unique(dfp, by='PANGEA_ID')
set(dfp, NULL, c('DUMMY2','DUMMY3','ROW'), NULL)
meta <- rbind(meta, dfp, use.names=TRUE, fill=TRUE)
dm <- merge(dm, meta, by='PANGEA_ID', all.x=1)
#z <- merge(dm, dm[, list(DUMMY3=length(COHORT)), by='TAXA'],by='TAXA')
# add CLASS subtype data
subtypeSummaryData <- as.data.table(subtypeSummaryData)
setnames(subtypeSummaryData, 'RCCS_studyid', 'STUDY_ID')
dst <- subset(subtypeSummaryData, select=c(STUDY_ID, gp.class, gag.class, pol.class, vpu.class, env.class, comp.class))
setkey(dst, STUDY_ID)
dst <- unique(dst, by='STUDY_ID') #no duplicates here
dst <- melt(dst, id.var='STUDY_ID')
set(dst, NULL, 'value', dst[, gsub('\\s','',gsub('Complex','',gsub('Subtype', '',value)))])
set(dst, NULL, 'variable', dst[, paste('SUBTYPE_',toupper(gsub('\\.class','',variable)),sep='')])
dst <- dcast.data.table(dst, STUDY_ID~variable)
dm <- merge(dm, dst, by='STUDY_ID',all.x=1)
# add COMET subtype data
dm <- merge(dm, dc, by='TAXA',all.x=1)
set(dm, dm[, which(is.na(COMET_1F1R))],'COMET_1F1R_N',0L)
set(dm, dm[, which(is.na(COMET_1F1R))],'COMET_1F1R','short')
set(dm, dm[, which(is.na(COMET_3F4F))],'COMET_3F4F_N',0L)
set(dm, dm[, which(is.na(COMET_3F4F))],'COMET_3F4F','short')
set(dm, dm[, which(is.na(COMET_4F3R))],'COMET_4F3R_N',0L)
set(dm, dm[, which(is.na(COMET_4F3R))],'COMET_4F3R','short')
# add Sanger processing data etc
ds <- data.table(read.csv("~/Dropbox (SPH Imperial College)/PANGEA_data/PANGEAconsensuses_2015-09_Imperial/PANGEA_HIV_n4562_Imperial_v150908_Summary.csv"))
setnames(ds, c('Sanger.ID','PANGEA.ID','reference.for.mapping','clinical.genome.coverage'), c('SANGER_ID','PANGEA_ID','REF_4_MAPPING','COV'))
tmp <- data.table(read.csv('~/Dropbox (SPH Imperial College)/PANGEA_data/PAN_iva_dependencies_9861.txt', sep='\t'))
setnames(tmp, 'LaneID', 'SANGER_ID')
set(tmp, NULL, 'SANGER_ID', tmp[, gsub('#','_',SANGER_ID)])
ds <- merge(ds, tmp, by='SANGER_ID', all.x=1)
set(ds, NULL, 'TAXA', ds[, gsub('\\s$','',gsub('^\\s','',as.character(PANGEA_ID)))])
set(ds, NULL, 'PANGEA_ID', ds[, gsub('-S.*','',TAXA)])
# some of the PANGEA IDs are duplicates and cannot be resolved. Use the coverage.
tmp <- as.character(sqp)
tmp2 <- apply(tmp, 1, function(x) sum(!x%in%c('?','-')))
tmp2 <- data.table(TAXA=names(tmp2), PANGEA_ID=gsub('-S.*','',names(tmp2)), COV=tmp2)
z <- subset(tmp2, grepl('-R2',TAXA))
# some manual adjustments to get this to match
set(tmp2, tmp2[, which(TAXA=='PG14-BW000057-S01150-R2')], 'COV', 8061L)
set(tmp2, tmp2[, which(TAXA=='PG14-BW000058-S01151-R2')], 'COV', 8090L)
set(tmp2, tmp2[, which(TAXA=='PG14-BW000059-S01152-R2')], 'COV', 8554L)
set(tmp2, tmp2[, which(TAXA=='PG14-BW000063-S01156-R2')], 'COV', 7891L)
set(tmp2, tmp2[, which(TAXA=='PG14-BW000064-S01157-R2')], 'COV', 7872L)
set(tmp2, tmp2[, which(TAXA=='PG14-UG002291-S00291-R2')], 'COV', 7392L)
set(tmp2, tmp2[, which(TAXA=='PG14-UG500523-S02527-R2')], 'COV', 3496L)
set(tmp2, tmp2[, which(TAXA=='PG14-UG500526-S02530-R2')], 'COV', 7664L)
set(tmp2, tmp2[, which(TAXA=='PG14-UG500526-S02530')], 'COV', 7322L)
set(tmp2, tmp2[, which(TAXA=='PG14-UG500529-S02533-R2')], 'COV', 7910L)
set(tmp2, tmp2[, which(TAXA=='PG14-UG500536-S02540-R2')], 'COV', 8210L)
set(tmp2, tmp2[, which(TAXA=='PG14-UG500541-S02545-R2')], 'COV', 5086L)
z <- subset(tmp2, grepl('-R2',TAXA))
z <- merge(subset(ds, COV>0, select=which(colnames(ds)!='TAXA')), subset(z,grepl("PG",TAXA) & COV>0, select=c(PANGEA_ID,TAXA,COV)), by=c('PANGEA_ID','COV'))
setnames(z, 'TAXA','TAXA_NEW')
ds <- merge(ds, subset(z, select=c(SANGER_ID,TAXA_NEW)), by='SANGER_ID', all.x=1)
tmp2 <- ds[, which(!is.na(TAXA_NEW))]
set(ds, tmp2, 'TAXA', ds[tmp2,TAXA_NEW])
ds <- subset(ds, COV>0)
setkey(ds, TAXA)
set(ds, NULL, c('TAXA_NEW','COV','PANGEA_ID'), NULL)
dm <- merge(dm, ds, by='TAXA', all.x=1)
stopifnot( nrow(subset(dm, is.na(SANGER_ID)))==0 )
# remove neg controls
stopifnot( length(setdiff(dm[, SANGER_ID], dgd.seqs[, sort(unique(SANGER_ID))]))==0 )
#dgd.seqs.backup <- copy(dgd.seqs)
dgd.seqs <- merge(dgd.seqs, subset(dm, select=c(TAXA, PANGEA_ID, SANGER_ID)), by='SANGER_ID')
dm <- subset(dm, !TAXA%in%negcontrols$TAXA)
sq <- sq[ !rownames(sq)%in%negcontrols$TAXA, ]
sqp <- sqp[ !rownames(sqp)%in%negcontrols$TAXA, ]
sqi <- subset(sqi, !TAXA%in%negcontrols$TAXA)
sqi[, DUMMY:=NULL]
dpand <- subset(dpand, !TAXA%in%negcontrols$TAXA)
dgd <- subset(dgd, !TAXA%in%negcontrols$TAXA)
dgd.seqs <- subset(dgd.seqs, !TAXA%in%negcontrols$TAXA)
# remove Rakai Test Plate
tmp <- subset(dm, COHORT=='Rakai Test Plate')[, TAXA]
dm <- subset(dm, COHORT!='Rakai Test Plate')
sq <- sq[ !rownames(sq)%in%tmp, ]
sqp <- sqp[ !rownames(sqp)%in%tmp, ]
sqi <- subset(sqi, !TAXA%in%tmp)
dpand <- subset(dpand, !TAXA%in%tmp)
dgd <- subset(dgd, !TAXA%in%tmp)
dgd.seqs <- subset(dgd.seqs, !TAXA%in%tmp)
# complete COHORT
set(dm, dm[, which(grepl('ZA',PANGEA_ID) & is.na(COHORT))], 'COHORT', 'AC_Resistance')
stopifnot( !nrow(subset(dm, is.na(COHORT))) )
set(dm, dm[, which(COHORT=='Rakai')], 'COHORT', 'RCCS')
# add extraction ID
dm[, EXTRACT_ID:= dm[,gsub('-S','',regmatches(TAXA,regexpr('-S[0-9]+', TAXA)))]]
# save
save(sq, sqp, sqi, dgene, dpan, dpand, dgd, dgd.seqs, dm, file=file.path(wdir,wfile))
}
}
##--------------------------------------------------------------------------------------------------------
## olli 27.06.11
treecomparison.bootstrap.mvr.dev<- function(indir=NULL, wdir=NULL)
{
require(ape)
require(data.table)
require(recosystem)
require(ggplot2)
# get master RDA file with all distances
if(0)
{
wdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/tree_mvr'
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations'
infile <- '150701_Regional_TRAIN4_SIMULATED.fa'
#infile <- '150701_Regional_TRAIN2_SIMULATED.fa'
# create tp with IDs -- need this to complete to matrix
seq <- read.dna(file.path(indir, infile), format='fa')
tp <- as.data.table( t(combn(rownames(seq),2)) )
setnames(tp, c('V1','V2'), c('TAXA1','TAXA2'))
tmp <- as.data.table( t(combn(seq_len(nrow(seq)),2)) )
setnames(tmp, c('V1','V2'), c('ID1','ID2'))
tp <- cbind(tp, tmp)
#
# read genetic distances between taxon pairs
#
infiles <- data.table(FILE=list.files(wdir, pattern='BATCH[0-9]+.rda$', full.names=TRUE))
infiles[, BATCH:= as.integer(gsub('BATCH','',regmatches(FILE, regexpr('BATCH[0-9]+', FILE))))]
setkey(infiles, BATCH)
# not yet completed
stopifnot( infiles[, length(setdiff(seq.int(1,400), BATCH))==0] )
# read files
tmp <- lapply(infiles[, FILE], function(x)
{
load(x)
ans <- merge(tp, tpi, by=c('TAXA1','TAXA2'))
ans
})
tp <- do.call('rbind', tmp)
setkey(tp, ID1, ID2)
tp[, GD_V:= GD_SD*GD_SD]
#
# save tp to file
#
save(tp, seq, file=file.path(wdir, gsub('\\.fa','_tps.rda',infile)))
}
if(0)
{
wdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/tree_mvr'
df <- data.table(FILE=list.files(wdir, pattern='newick$',full.names=1))
tmp <- df[, {
ph2 <- read.tree(FILE)
list(CH= nrow(unique(data.table(TAXA=ph2$tip.label)))==Ntip(ph2) )
}, by='FILE']
}
if(0)
{
na.rm.p <- NA
complete.distance.matrix <- 0
seed <- 123
v.mult <- 1.2
reco.opts <- c(dim=750, costp_l1=0, costp_l2=0.001, costq_l1=0, costq_l2=0.001, nthread=1, lrate=0.003, niter=120)
verbose <- 1
#wdir <- '/work/or105/Gates_2014/tree_comparison/mvr'
wdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/tree_mvr'
infile <- '150701_Regional_TRAIN4_SIMULATED_tps.rda'
#infile <- '150701_Regional_TRAIN2_SIMULATED_tps.rda'
load(file.path(wdir, infile))
loop.rep <- tp[, unique(REP)]
loop.gene <- tp[, unique(GENE)]
loop.gene <- "gag+pol+env"
for(gene in loop.gene)
for(rep in loop.rep)
{
#gene <- "env"; rep<- 1
#gene <- "gag+pol+env"; rep<- 1
tps <- subset(tp, GENE==gene & REP==rep)
outfile <- file.path(wdir, gsub('\\.rda',paste('_GENE_',gene,'_REP_',rep,'_C_',complete.distance.matrix,sep=''), infile))
tmp <- seq.mvr.d.and.v(tps, seed=seed, v.mult=v.mult, complete.distance.matrix=complete.distance.matrix, reco.opts=reco.opts, outfile=outfile, verbose=verbose)
d <- tmp$d
v <- tmp$v
tmp <- NULL
gc()
# write to file
d <- as.matrix(d)
v <- as.matrix(v)
d[is.na(d)] <- -1
v[is.na(v)] <- -1
file.d <- paste(gsub('\\.rda|\\.newick|\\.tree','',outfile),'_d.phylip',sep='')
file.v <- paste(gsub('\\.rda|\\.newick|\\.tree','',outfile),'_v.phylip',sep='')
seq.write.dna.phylip.triangular(d, file=file.d)
seq.write.dna.phylip.triangular(v, file=file.v)
# call PhyD*
tmp <- cmd.phydstar(file.d, outfile=outfile, method='BioNJ', fs=15, binary=TRUE, negative.branch.length=FALSE, lower.triangular=TRUE)
cat(tmp)
tmp <- cmd.phydstar(file.d, outfile=outfile, infile.v=file.v, method='MVR', fs=15, binary=TRUE, negative.branch.length=FALSE, lower.triangular=TRUE)
system(tmp)
outfile <- paste(outfile,'_',paste(reco.opts,collapse='_'),'_mvr.newick',sep='')
options(expressions=5e5)
write.tree(ph, file=outfile)
options(expressions=5e3)
}
quit('no')
}
if(0)
{
tps <- subset(tp, REP==1 & GENE=='gag+pol+env')
tmp <- dcast.data.table(tps, ID1~ID2, value.var='GD')
d <- cbind(NA_real_, as.matrix(tmp[, -1, with=FALSE]))
d <- rbind(d, NA_real_)
colnames(d)[1] <- setdiff( as.character(tmp[, ID1]), colnames(d) )
rownames(d) <- colnames(d)
diag(d) <- 0
# complete lower triangular from upper triangular and vice versa
tmp <- lower.tri(d) & is.na(d)
d[tmp] <- t(d)[tmp]
tmp <- upper.tri(d) & is.na(d)
d[tmp] <- t(d)[tmp]
# reset names
tmp <- subset( tps, select=c(TAXA1, ID1) )
setnames(tmp, c('TAXA1','ID1'), c('TAXA2','ID2') )
tmp <- unique(rbind( tmp, subset( tps, select=c(TAXA2, ID2) ) ))
setnames(tmp, c('TAXA2','ID2'), c('TAXA','ID') )
setkey(tmp, ID)
rownames(d) <- tmp[, TAXA]
colnames(d) <- tmp[, TAXA]
# checks
stopifnot(ncol(d)==nrow(d))
stopifnot(length(which(is.na(d)))==2*nrow(subset(tps, is.na(GD))))
cat('D matrix: proportion of NA entries=',length(which(is.na(d))) / prod(dim(d)))
#
# get variance matrix
#
tmp <- dcast.data.table(tps, ID1~ID2, value.var='GD_V')
v <- cbind(NA_real_, as.matrix(tmp[, -1, with=FALSE]))
v <- rbind(v, NA_real_)
colnames(v)[1] <- setdiff( as.character(tmp[, ID1]), colnames(v) )
rownames(v) <- colnames(v)
diag(v) <- 0
# complete lower triangular from upper triangular and vice versa
tmp <- lower.tri(v) & is.na(v)
v[tmp] <- t(v)[tmp]
tmp <- upper.tri(v) & is.na(v)
v[tmp] <- t(v)[tmp]
# reset names
tmp <- subset( tps, select=c(TAXA1, ID1) )
setnames(tmp, c('TAXA1','ID1'), c('TAXA2','ID2') )
tmp <- unique(rbind( tmp, subset( tps, select=c(TAXA2, ID2) ) ))
setnames(tmp, c('TAXA2','ID2'), c('TAXA','ID') )
setkey(tmp, ID)
rownames(v) <- tmp[, TAXA]
colnames(v) <- tmp[, TAXA]
# checks
stopifnot(ncol(v)==nrow(v))
cat('V matrix: proportion of NA entries=',length(which(is.na(v))) / prod(dim(v)))
#
# remove cols/rows that contain nothing else than NAs
#
diag(d) <- NA_real_
diag(v) <- NA_real_
tmp <- apply(d, 1, function(x) !all(is.na(x)))
cat('\nIn D: found',length(which(!tmp)),'columns / rows with NA only: remove in D and V. ', rownames(d)[!tmp])
ds <- d[tmp,tmp]
vs <- v[tmp,tmp]
tmp <- apply(vs, 1, function(x) !all(is.na(x)))
cat('\nIn V: found additional',length(which(!tmp)),'columns / rows with NA only: remove in D and V too. ', rownames(d)[!tmp])
ds <- ds[tmp,tmp]
vs <- vs[tmp,tmp]
}
if(0)
{
wdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/tree_mvr'
load(file.path(wdir, '150701_Regional_TRAIN4_SIMULATED_tps.rda'))
tps <- subset(tp, REP==1 & GENE=='gag+pol+env', select=c(ID1, ID2, GD))
# add upper triangular
tmp <- copy(tps)
set(tmp, NULL, 'ID1', tps[, ID2])
set(tmp, NULL, 'ID2', tps[, ID1])
tps <- rbind(tps, tmp)
# add zero diagonal
tmp <- tps[, range(ID1)]
tmp <- data.table(ID1= seq.int(tmp[1], tmp[2]), ID2= seq.int(tmp[1], tmp[2]), GD=0)
tps <- rbind(tps, tmp)
# ignore NA entries
tps.na <- subset(tps, is.na(GD))
tps <- subset(tps, !is.na(GD))
tps.all <- rbind(tps, tps.na)
# setup matrix completion
tmp <- data_memory(tps[,ID1], tps[,ID2], rating=tps[,GD], index1=TRUE)
set.seed(123)
r <- Reco()
opts <- r$tune(tmp, opts=list(dim=c(10, 100, 500, 750), lrate=c(0.01), costp_l1=0, costp_l2=c(0.001, 0.01, 0.1), costq_l1=0, costq_l2=c(0.001, 0.01, 0.1), nthread=1, niter=10))
#best is dim=750 costp_l2=0.001 costq_l2=0.001 lrate=0.01 rmse=0.01457322
opts <- r$tune(tmp, opts=list(dim=c(500, 750, 1000), lrate=c(0.001, 0.01), costp_l1=0, costp_l2=c(0.0001, 0.001), costq_l1=0, costq_l2=c(0.0001, 0.001), nthread=1, niter=10))
#best is dim=1000 costp_l2=0.0001 costq_l2=0.0001 lrate=0.01 rmse=0.01457322
opts <- r$tune(tmp, opts=list(dim=c(100, 500), lrate=c(0.003, 0.005), costp_l1=0, costp_l2=c(0.01), costq_l1=0, costq_l2=c(0.01), nthread=1, niter=100))
r$train(tmp, opts=c(dim=500, costp_l1=0, costp_l2=0.01, costq_l1=0, costq_l2=0.01, nthread=1, lrate=0.003, niter=40))
#rmse 0.0211
tps.all[, GDp:= r$predict(data_memory(tps.all[,ID1], tps.all[,ID2], index1=TRUE), out_memory())]
#plot
ggplot(subset(tps.all, !is.na(GD)), aes(x=GD, y=GDp)) + geom_point(colour='grey80', size=0.5, pch=16) + geom_abline(slope=1, intercept=0)
ggsave(file=file.path(wdir, 'reco_500_1e-2_3e-3_40.pdf'), w=7, h=7)
r$train(tmp, opts=c(dim=750, costp_l1=0, costp_l2=0.001, costq_l1=0, costq_l2=0.001, nthread=1, lrate=0.003, niter=120))
#rmse 0.0155
tps.all[, GDp2:= r$predict(data_memory(tps.all[,ID1], tps.all[,ID2], index1=TRUE), out_memory())]
#plot
ggplot(subset(tps.all, !is.na(GD)), aes(x=GD, y=GDp2)) + geom_point(colour='grey80', size=0.5, pch=16) + geom_abline(slope=1, intercept=0)
ggsave(file=file.path(wdir, 'reco_750_1e-3_3e-3_120.pdf'), w=7, h=7)
# fill in distance matrix
tps.all[, GDf:= GD]
tmp <- tps.all[, which(is.na(GDf))]
set(tps.all, tmp, 'GDf', tps.all[tmp, GDp2])
# convert to matrix (not necessarily symmetric)
tmp <- dcast.data.table( subset(tps.all, select=c(ID1,ID2,GDf)), ID1~ID2, value.var='GDf' )
d <- as.matrix(tmp[, -1, with=FALSE])
rownames(d) <- colnames(d)
# make symmetric
d <- (d+t(d))/2
# some rows/cols may have NAs only -- remove these as the matrix completion problem is ill-specified for these
tmp <- subset(tps.na[, list(GDM=length(GD)), by='ID1'], GDM==nrow(d)-1) #subtract one since diagonal is zero
tmp <- setdiff(rownames(d), tmp[, as.character(ID1)] )
d <- d[tmp, tmp]
#
# generate variance matrix
#
tps <- subset(tp, REP==1 & GENE=='gag+pol+env', select=c(TAXA1, ID1, TAXA2, ID2, GD_V))
tmp <- dcast.data.table(tps, ID1~ID2, value.var='GD_V')
v <- cbind(NA_real_, as.matrix(tmp[, -1, with=FALSE]))
v <- rbind(v, NA_real_)
colnames(v)[1] <- setdiff( as.character(tmp[, ID1]), colnames(v) )
rownames(v) <- colnames(v)
diag(v) <- 0
# complete lower triangular from upper triangular and vice versa
tmp <- lower.tri(v) & is.na(v)
v[tmp] <- t(v)[tmp]
tmp <- upper.tri(v) & is.na(v)
v[tmp] <- t(v)[tmp]
# set missing variances to large default
v[is.na(v)] <- max(v, na.rm=TRUE)*1.2
v <- v[rownames(d),colnames(d)]
#
# reset names
#
tmp <- subset( tps, select=c(TAXA1, ID1) )
setnames(tmp, c('TAXA1','ID1'), c('TAXA2','ID2') )
tmp <- unique(rbind( tmp, subset( tps, select=c(TAXA2, ID2) ) ))
setnames(tmp, c('TAXA2','ID2'), c('TAXA','ID') )
tmp <- merge(tmp, data.table(ID=as.integer(rownames(d))), by='ID')
setkey(tmp, ID)
rownames(d) <- tmp[, TAXA]
colnames(d) <- tmp[, TAXA]
rownames(v) <- tmp[, TAXA]
colnames(v) <- tmp[, TAXA]
#
# run mvr with completed distance and variance matrices
#
d <- as.dist(d)
v <- as.dist(v)
ph <- mvr(d, v)
}
if(0) #play with basic recosystem example
{
# this is the example
train_set <- data_file(system.file("dat", "smalltrain.txt", package = "recosystem"))
test_set <- data_file(system.file("dat", "smalltest.txt", package = "recosystem"))
set.seed(123)
r <- Reco()
opts <- r$tune(train_set, opts=list(dim=c(10, 20, 30), lrate=c(0.1, 0.2), costp_l1=0, costq_l1=0, nthread=1, niter=10))
r$train(train_set, opts = c(opts$min, nthread = 1, niter = 20))
pred_rvec <- r$predict(test_set, out_memory())
test <- read.table(test_set@source, sep = " ", header = FALSE)
# this is the same example but from memory
infile <- system.file("dat", "smalltrain.txt", package = "recosystem")
dm <- as.data.table(read.table(file=infile, sep=' '))
setnames(dm, c('V1','V2','V3'), c('IDX1','IDX2','D'))
infile <- system.file("dat", "smalltest.txt", package = "recosystem")
dt <- as.data.table(read.table(file=infile, sep=' '))
setnames(dt, c('V1','V2'), c('IDX1','IDX2'))
dm.eco <- data_memory(dm[,IDX1], dm[,IDX2], rating=dm[,D], index1=FALSE)
dt.eco <- data_memory(dt[,IDX1], dt[,IDX2], index1=FALSE)
set.seed(123)
r <- Reco()
opts2 <- r$tune(dm.eco, opts=list(dim=c(10, 20, 30), lrate=c(0.1, 0.2), costp_l1=0, costq_l1=0, nthread=1, niter=10))
r$train(dm.eco, opts = c(opts2$min, nthread = 1, niter = 20))
pred_rvec2 <- r$predict(dt.eco, out_memory())
stopifnot( length(which( pred_rvec!=pred_rvec2 ))==0 ) #OK this works
# this is the same example but from memory and with ordered entries
infile <- system.file("dat", "smalltrain.txt", package = "recosystem")
dm <- as.data.table(read.table(file=infile, sep=' '))
setnames(dm, c('V1','V2','V3'), c('IDX1','IDX2','D'))
infile <- system.file("dat", "smalltest.txt", package = "recosystem")
dt <- as.data.table(read.table(file=infile, sep=' '))
setnames(dt, c('V1','V2'), c('IDX1','IDX2'))
setkey(dm, IDX1, IDX2)
setkey(dt, IDX1, IDX2)
dm.eco <- data_memory(dm[,IDX1], dm[,IDX2], rating=dm[,D], index1=FALSE)
dt.eco <- data_memory(dt[,IDX1], dt[,IDX2], index1=FALSE)
set.seed(123)
r <- Reco()
opts3 <- r$tune(dm.eco, opts=list(dim=c(10, 20, 30), lrate=c(0.1, 0.2), costp_l1=0, costq_l1=0, nthread=1, niter=10))
r$train(dm.eco, opts = c(opts3$min, nthread = 1, niter = 20))
pred_rvec3 <- r$predict(dt.eco, out_memory())
stopifnot( length(which( pred_rvec!=pred_rvec3 ))==0 ) #not identical
# reproduce RMSE manually
infile <- system.file("dat", "smalltrain.txt", package = "recosystem")
dm <- as.data.table(read.table(file=infile, sep=' '))
setnames(dm, c('V1','V2','V3'), c('IDX1','IDX2','D'))
dm.eco <- data_memory(dm[,IDX1], dm[,IDX2], rating=dm[,D], index1=FALSE)
infile <- system.file("dat", "smalltest.txt", package = "recosystem")
dt <- copy(dm)
dt[, D:=NULL]
dt.eco <- data_memory(dt[,IDX1], dt[,IDX2], index1=FALSE)
set.seed(123)
r <- Reco()
opts4 <- r$tune(dm.eco, opts=list(dim=c(10, 20, 30), lrate=c(0.1, 0.2), costp_l1=0, costq_l1=0, nthread=1, niter=10))
r$train(dm.eco, opts = c(opts4$min, nthread = 1, niter = 20)) #RMSE improves with niter
dm[, PREDICT:= r$predict(dt.eco, out_memory())]
subset(dm, !is.na(D))[, sqrt(mean((D-PREDICT)*(D-PREDICT)))] #OK this works
# reproduce RMSE manually also when entries are ordered and all entries to be predicted?
# note: diagonal is not automatically considered zero,
# and the matrix is not necessarily symmetric either!
infile <- system.file("dat", "smalltrain.txt", package = "recosystem")
dm <- as.data.table(read.table(file=infile, sep=' '))
setnames(dm, c('V1','V2','V3'), c('IDX1','IDX2','D'))
setkey(dm, IDX1, IDX2)
dm.eco <- data_memory(dm[,IDX1], dm[,IDX2], rating=dm[,D], index1=FALSE)
tmp <- dm[, range(IDX1)]
dp <- as.data.table(expand.grid(IDX1=seq.int(tmp[1],tmp[2]), IDX2=seq.int(tmp[1],tmp[2])))
tmp <- data_memory(dp[,IDX1], dp[,IDX2], index1=FALSE)
set.seed(123)
r <- Reco()
opts <- r$tune(dm.eco, opts=list(dim=c(10, 20, 30), lrate=c(0.1, 0.2), costp_l1=0, costq_l1=0, nthread=1, niter=10))
r$train(dm.eco, opts = c(opts$min, nthread = 1, niter = 20))
dp[, PREDICT:= r$predict(tmp, out_memory())]
dp <- merge(dp, dm, by=c('IDX1','IDX2'), all.x=1)
subset(dp, !is.na(D))[, sqrt(mean((D-PREDICT)*(D-PREDICT)))] #OK this works too
}
}
##--------------------------------------------------------------------------------------------------------
## olli 27.06.11
seq.big.mvr<- function(tps, na.rm.p=NA, mds.args=list('ndim'= 750, type="mspline", "spline.intKnots"=3, "spline.degree"=2), wfile=NA)
{
require(smacof)
# select (rep 1 gag+pol+env)
tps <- subset(tp, REP==1 & GENE=='gag+pol+env')
#
# get distance matrix
#
tmp <- dcast.data.table(tps, ID1~ID2, value.var='GD')
d <- cbind(NA_real_, as.matrix(tmp[, -1, with=FALSE]))
d <- rbind(d, NA_real_)
colnames(d)[1] <- setdiff( as.character(tmp[, ID1]), colnames(d) )
rownames(d) <- colnames(d)
diag(d) <- 0
# complete lower triangular from upper triangular and vice versa
tmp <- lower.tri(d) & is.na(d)
d[tmp] <- t(d)[tmp]
tmp <- upper.tri(d) & is.na(d)
d[tmp] <- t(d)[tmp]
# reset names
tmp <- subset( tps, select=c(TAXA1, ID1) )
setnames(tmp, c('TAXA1','ID1'), c('TAXA2','ID2') )
tmp <- unique(rbind( tmp, subset( tps, select=c(TAXA2, ID2) ) ))
setnames(tmp, c('TAXA2','ID2'), c('TAXA','ID') )
setkey(tmp, ID)
rownames(d) <- tmp[, TAXA]
colnames(d) <- tmp[, TAXA]
# checks
stopifnot(ncol(d)==nrow(d))
stopifnot(length(which(is.na(d)))==2*nrow(subset(tps, is.na(GD))))
cat('D matrix: proportion of NA entries=',length(which(is.na(d))) / prod(dim(d)))
#
# get variance matrix
#
tmp <- dcast.data.table(tps, ID1~ID2, value.var='GD_V')
v <- cbind(NA_real_, as.matrix(tmp[, -1, with=FALSE]))
v <- rbind(v, NA_real_)
colnames(v)[1] <- setdiff( as.character(tmp[, ID1]), colnames(v) )
rownames(v) <- colnames(v)
diag(v) <- 0
# complete lower triangular from upper triangular and vice versa
tmp <- lower.tri(v) & is.na(v)
v[tmp] <- t(v)[tmp]
tmp <- upper.tri(v) & is.na(v)
v[tmp] <- t(v)[tmp]
# reset names
tmp <- subset( tps, select=c(TAXA1, ID1) )
setnames(tmp, c('TAXA1','ID1'), c('TAXA2','ID2') )
tmp <- unique(rbind( tmp, subset( tps, select=c(TAXA2, ID2) ) ))
setnames(tmp, c('TAXA2','ID2'), c('TAXA','ID') )
setkey(tmp, ID)
rownames(v) <- tmp[, TAXA]
colnames(v) <- tmp[, TAXA]
# checks
stopifnot(ncol(v)==nrow(v))
cat('V matrix: proportion of NA entries=',length(which(is.na(v))) / prod(dim(v)))
#
# remove cols/rows that contain nothing else than NAs
#
diag(d) <- NA_real_
diag(v) <- NA_real_
tmp <- apply(d, 1, function(x) !all(is.na(x)))
cat('\nIn D: found',length(which(!tmp)),'columns / rows with NA only: remove in D and V. ', rownames(d)[!tmp])
ds <- d[tmp,tmp]
vs <- v[tmp,tmp]
tmp <- apply(vs, 1, function(x) !all(is.na(x)))
cat('\nIn V: found additional',length(which(!tmp)),'columns / rows with NA only: remove in D and V too. ', rownames(d)[!tmp])
ds <- ds[tmp,tmp]
vs <- vs[tmp,tmp]
#
# remove cols/rows that contain more than 10% NAs
#
if(!is.na(na.rm.p))
{
tmp <- apply(ds, 1, function(x) length(which(is.na(x))) ) / ncol(ds)
tmp <- which(tmp>na.rm.p)
cat('\nIn D: found',length(tmp),'columns / rows with more than',na.rm.p*100,'% NAs: remove in D and V. ', rownames(ds)[tmp])
ds <- ds[-tmp,-tmp]
vs <- vs[-tmp,-tmp]
}
#
# do MDS to impute missing distances since mvrs fails too often
#
attach(mds.args)
diag(ds) <- 0
#mds.fit <- mds(ds, ...)
mds.fit <- mds(ds, ndim=ndim, type=type, spline.intKnots=spline.intKnots, spline.degree=spline.degree)
if(!is.na(wfile))
{
pdf(file=paste(wfile,'.shephard.pdf',sep=''), width=5, height=5)
plot(mds.fit, plot.type = "Shepard")
dev.off()
}
mds.ifit <- inverseMDS(mds.fit$conf)
if(!is.na(wfile))
{
save(tps, ds, vs, mds.fit, mds.ifit, mds.args, file=paste(wfile,'mds.rda',sep=''))
}
# unfinished
#rnd.stress <- mean( randomstress(n=1591, ndim=500, nrep=100) ) #even just one iteration takes forever
# njs(ds, fs = 15) # runs fine
# ph <- mvrs(ds, vs, fs = 15)
}
##--------------------------------------------------------------------------------------------------------
## olli 27.06.11
treecomparison.bootstrap.sd.vs.coverage<- function(indir=NULL, wdir=NULL)
{
require(ape)
require(data.table)
require(ggplot2)
batch.n <- 3200
if(is.null(indir) | is.null(wdir))
{
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations'
wdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/tree_mvr'
}
infile <- '150701_Regional_TRAIN4_SIMULATED.fa'
seq <- read.dna(file.path(indir, infile), format='fa')
seqi <- as.data.table(read.csv(file.path(indir, gsub('\\.fa','_gene.txt',infile)), header=0))
seqi[, GENE:= regmatches(V2, regexpr('[a-z]+', V2))]
seqi[, START:= as.integer(gsub('-','',regmatches(V2, regexpr('[0-9]+-', V2))))]
seqi[, END:= as.integer(gsub('-','',regmatches(V2, regexpr('-[0-9]+', V2))))]
seqi[, GENE_L:=END-START+1L]
seqi <- subset(seqi, select=c(GENE,START,END,GENE_L))
seqi <- rbind(seqi, data.table(GENE='gag+pol+env', START=1L, END=seqi[, max(END)], GENE_L=seqi[, max(END)]))
load(file.path(wdir,'150701_Regional_TRAIN4_SIMULATED_tps.rda'))
tp <- merge(tp, subset(seqi, select=c(GENE, GENE_L)), by='GENE')
set(tp, NULL, 'GENE', tp[, factor(GENE, levels=c('gag','pol','env','gag+pol+env'), labels=c('gag','pol','env','gag+pol+env'))])
#
# do we have higher bootstrap variance if there are more gaps by gene?
#
tmp <- subset(tp, REP==1)
ggplot( subset(tmp, GD_MEAN>0), aes(x=cut(DO/GENE_L, breaks=seq(0,1,0.01), labels=seq(0.01,1,0.01)), y=GD_SD) ) +
geom_boxplot(outlier.shape=NA) +
coord_cartesian(ylim=c(0,0.18)) +
scale_x_discrete(breaks=seq(0,1,0.1), labels=paste(100*seq(0,1,0.1),'%',sep='')) +
scale_y_continuous(expand=c(0,0)) +
facet_grid(~GENE) + theme_bw() +
labs(x='\noverlap between taxon pairs\n(% of sequence length)', y='std deviation in genetic distance\n')
ggsave(file=file.path(wdir, gsub('.fa','_GDSD_by_overlap.pdf',infile)), w=14, h=7)
ggplot( subset(tmp, GD_MEAN>0), aes(x=cut(DO/GENE_L, breaks=seq(0,1,0.01), labels=seq(0.01,1,0.01)), y=GD_SD/GD_MEAN) ) +
geom_boxplot(outlier.shape=NA) +
coord_cartesian(ylim=c(0,0.6)) +
scale_x_discrete(breaks=seq(0,1,0.1), labels=paste(100*seq(0,1,0.1),'%',sep='')) +
scale_y_continuous(expand=c(0,0)) +
facet_grid(~GENE) + theme_bw() +
labs(x='\noverlap between taxon pairs\n(% of sequence length)', y='coefficient of variation in genetic distance\nacross bootstrap alignments\n')
ggsave(file=file.path(wdir, gsub('.fa','_GDCOV_by_overlap.pdf',infile)), w=14, h=7)
}
##--------------------------------------------------------------------------------------------------------
## olli 27.06.11
treecomparison.gd.dev<- function()
{
require(ape)
require(data.table)
#
# get true trees
#
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim_internal/freeze_July15'
tfiles <- list.files(indir, pattern='newick$', full.names=TRUE)
tfiles <- data.table( FILE_T= tfiles[ grepl('SUBSTTREE', tfiles) | grepl('Vill_99', tfiles) | grepl('Vill.*DATEDTREE', tfiles) ] )
tfiles[, SC:= toupper(gsub('_SUBSTTREE|_DATEDTREE','',gsub('.newick','',basename(FILE_T))))]
tmp <- rbind( subset(tfiles, SC=='VILL_99_APR15'), subset(tfiles, SC=='VILL_99_APR15'), subset(tfiles, SC=='VILL_99_APR15') )
set(tmp, NULL, 'SC', c('150701_VILL_SCENARIO-C','150701_VILL_SCENARIO-D','150701_VILL_SCENARIO-E'))
tfiles <- rbind(tfiles, tmp)
tmp <- list.files(indir, pattern='newick$', full.names=TRUE)
tmp <- data.table( FILE_T= tmp[ grepl('Reg.*DATEDTREE', tmp) ] )
tmp[, SC:= toupper(gsub('_SUBSTTREE|_DATEDTREE','',gsub('.newick','',basename(FILE_T))))]
tfiles <- rbind(tfiles, tmp)
tfiles[, BRL_T:= 'time']
set(tfiles, tfiles[, which(grepl('REG',SC) & grepl('SUBST',FILE_T))], 'BRL_T', 'subst')
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations'
tmp <- data.table(FASTA_FILE=list.files(indir, full.names=1, pattern='_TRAIN[0-9]+_SIMULATED.fa$'))
tmp[, SC:= toupper(gsub('_SIMULATED.fa','',basename(FASTA_FILE)))]
tfiles <- merge(tfiles, tmp, by='SC')
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/tree_mvr'
tmp <- data.table(MVR_FILE=list.files(indir, full.names=1, pattern='_SIMULATED_tps.rda$'))
tmp[, SC:= toupper(gsub('_SIMULATED_tps.rda','',basename(MVR_FILE)))]
tfiles <- merge(tfiles, tmp, by='SC', all.x=1)
ttrs <- lapply(tfiles[, FILE_T], function(x) read.tree(file=x) )
names(ttrs) <- tfiles[, SC]
tfiles[, IDX_T:=seq_along(ttrs)]
tfiles[, TAXAN_T:= sapply(ttrs, Ntip)]
#
# read true patristic distances from true tree
#
tbrl <- subset(tfiles, BRL_T=='subst')[, {
#IDX_T <- 1
ph <- ttrs[[IDX_T]]
tmp <- distTips(ph, seq_len(Ntip(ph)), method='patristic', useC=TRUE)
tmp <- as.matrix(tmp)
tmp[upper.tri(tmp, diag=TRUE)] <- NA_real_
tmp <- as.data.table(melt(tmp))
setnames(tmp, c('Var1','Var2','value'),c('TAXA1','TAXA2','PD_T'))
tmp <- subset(tmp, !is.na(PD_T))
tmp
}, by='IDX_T']
# z[, table(IDX_T)]
# 1 3 5 7 9
# 1279200 1279200 1279200 1279200 1279200
#
# read true raw genetic distances from sequences
#
tmp <- subset(tfiles, BRL_T=='subst')[, {
#FASTA_FILE<- "/Users/Oliver/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations/150701_Regional_TRAIN2_SIMULATED.fa"
sq <- read.dna(FASTA_FILE, format='fa')
tmp <- dist.dna(sq, model='raw', as.matrix=TRUE, pairwise.deletion=TRUE)
tmp <- as.matrix(tmp)
tmp[upper.tri(tmp, diag=TRUE)] <- NA_real_
tmp <- as.data.table(melt(tmp))
setnames(tmp, c('Var1','Var2','value'),c('TAXA1','TAXA2','ALL_GD_RAW_T'))
tmp <- subset(tmp, !is.na(ALL_GD_RAW_T))
tmp
}, by='IDX_T']
tbrl <- merge(tbrl, tmp, by=c('IDX_T','TAXA1','TAXA2'), all.x=1)
set(tbrl, NULL, 'TAXA1', tbrl[, as.character(TAXA1)])
set(tbrl, NULL, 'TAXA2', tbrl[, as.character(TAXA2)])
#
# read genetic distances that I calculated previously
#
tmp <- unique(subset(tfiles, BRL_T=='subst' & !is.na(MVR_FILE), c(IDX_T, MVR_FILE)))
tmp <- lapply(seq_len(nrow(tmp)), function(i){
#MVR_FILE<- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/tree_mvr/150701_Regional_TRAIN2_SIMULATED_tps.rda'
MVR_FILE <- tmp[i, MVR_FILE]
load(MVR_FILE)
z <- subset(tp, REP==1, select=c(TAXA1, TAXA2, GENE, GD, DO, GD_MEAN, GD_SD))
z[, IDX_T:= tmp[i, IDX_T]]
z
})
tmp <- do.call('rbind',tmp)
tmp <- dcast.data.table(melt(tmp, id.vars=c('IDX_T','TAXA1','TAXA2','GENE')), IDX_T+TAXA1+TAXA2~variable+GENE, value.var='value')
merge(tbrl, tmp, by=c('IDX_T','TAXA1','TAXA2'))
subset(tbrl, IDX_T==3 & TAXA2=='IDPOP_100181|F|DOB_2000.1|2016.31')
subset(tbrl, IDX_T==3 & grepl('IDPOP_100181',TAXA2))
tmp[, table(IDX_T)]
tbrl <- merge(tbrl, tmp, by=c('IDX_T','TAXA1','TAXA2'), all=1)
tbrl <- subset(tbrl, !is.na(PD_T))
#
# calculate MSEs
#
# MSE from simple raw genetic distance approach
tmp <- subset(tbrl, !is.na(ALL_GD_RAW_T) & (is.na(GENE) | GENE=='gag+pol+env'))
mse <- tmp[, list( TYPE='ALL_GD_RAW_T', GENE='gag+pol+env', PAIR_N=length(PD_T), MSE=mean((PD_T-ALL_GD_RAW_T)*(PD_T-ALL_GD_RAW_T)) ), by='IDX_T']
# MSE from my previously calculated distances
tmp <- subset(tbrl, !is.na(GENE) & !is.na(GD))
tmp <- tmp[, list( TYPE='GD', PAIR_N=length(PD_T), MSE=mean((PD_T-GD)*(PD_T-GD)) ), by=c('IDX_T','GENE')]
mse <- rbind(mse, tmp, use.names=TRUE, fill=TRUE)
MSE_GD_MEAN=mean((PD_T-GD_MEAN)*(PD_T-GD_MEAN))
tmp <- tmp[, list(FULL_GAPS_P=mean(FULL_GAPS_P), GAG_GAPS_P=mean(GAG_GAPS_P), POL_GAPS_P=mean(POL_GAPS_P), ENV_GAPS_P=mean(ENV_GAPS_P)), by='SC']
tinfo <- merge(tinfo, tmp, all.x=1, by='SC')
}
##--------------------------------------------------------------------------------------------------------
## olli 27.06.11
treecomparison.bootstrap.gd.dev<- function()
{
require(ape)
require(data.table)
bsn <- 1e2
repn <- 10
batch.n <- 3200
batch.i <- 1
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations'
infile <- '150701_Regional_TRAIN4_SIMULATED.fa'
outdir <- indir
outfile <- paste(gsub('.fa','',infile),'_GDS_BATCH',batch.i,'.rda',sep='')
seq <- read.dna(file.path(indir, infile), format='fa')
# for this first analysis, to see if the resulting trees are meaningful at all,
# use just gag, pol and full
seqi <- as.data.table(read.csv(file.path(indir, gsub('\\.fa','_gene.txt',infile)), header=0))
seqi[, GENE:= regmatches(V2, regexpr('[a-z]+', V2))]
seqi[, START:= as.integer(gsub('-','',regmatches(V2, regexpr('[0-9]+-', V2))))]
seqi[, END:= as.integer(gsub('-','',regmatches(V2, regexpr('-[0-9]+', V2))))]
seqi <- subset(seqi, select=c(GENE,START,END))
#seqi <- rbind(seqi, data.table(GENE='full', START=1L, END= seqi[, max(END)]))
tp <- as.data.table( t(combn(rownames(seq),2)) )
#tp <- as.data.table( t(combn(sample(50, 10),2)) ) #this won t work. we need the specific ordering of the IDs that is used to create the combinations
setnames(tp, c('V1','V2'), c('TAXA1','TAXA2'))
tmp <- as.data.table( t(combn(seq_len(nrow(seq)),2)) )
setnames(tmp, c('V1','V2'), c('ID1','ID2'))
tp <- cbind(tp, tmp)
tp[, IDX:= seq_len(nrow(tp))]
# subset by batch
tp[, BATCH:= ceiling(IDX/batch.n)]
if(!is.na(batch.i))
tp <- subset(tp, BATCH==batch.i)
tmp <- dcast.data.table(tp, TAXA1~TAXA2, value.var='IDX')
d <- cbind(NA_real_, as.matrix(tmp[, -1, with=FALSE]))
d <- rbind(d, NA_real_)
colnames(d)[1] <- setdiff( as.character(tmp[, TAXA1]), colnames(d) )
rownames(d) <- colnames(d)
diag(d) <- 0
# complete lower triangular from upper triangular and vice versa
tmp <- lower.tri(d) & is.na(d)
d[tmp] <- t(d)[tmp]
tmp <- upper.tri(d) & is.na(d)
d[tmp] <- t(d)[tmp]
#
subset(tps, is.na(GD))
stopifnot(ncol(d)==nrow(d))
cat('proportion of NA entries=',length(which(is.na(d))) / prod(dim(d)))
#for each pair, estimate: actual distance, mean distance, variance in distance:
# (do this pairwise because otherwise too computationally expensive
# by gene
tpi <- tp[, {
cat('IDX',IDX, round(IDX/nrow(tp),d=3))
#TAXA1 <- 'IDPOP_13649|M|DOB_1906.66|2011.23'; TAXA2<- 'IDPOP_27993|F|DOB_1961.29|1991.587'
#START <- 1; END<- 1473
#system.time({
df.gd <- seqi[, {
spc <- as.character(seq[c(TAXA1,TAXA2), START:END])
# use same seed across all bootstrap runs, ie running for every gene is the same as running for the full genome
# and running for every taxon pair is the same as running for the whole alignment
set.seed(42)
tmp <- as.data.table(expand.grid(REP=seq_len(repn), BS=seq_len(bsn+1)))
tmp <- tmp[, {
# take bootstrap sample (except if bsi==1)
# the bootstrap is relative to the gene region!
spcb <- copy(spc)
if(BS>1)
{
# note: bootstrap includes ? columns, which adds uncertainty when genetic distances are evaluated over the overlap columns
spcb<- spcb[, sample(ncol(spcb), replace=TRUE)]
}
spb <- as.DNAbin(spcb)
# overlap that is not '?'
do <- sum(apply( spcb!='?', 2, prod))
# count genetic distance on overlap region, ie count gaps '-' as well, on anything that is not '?'
dn <- as.numeric( dist.dna(spb, model='N', pairwise.deletion=TRUE) )
# add indels to differences, but not when the other sequence is '?'
tmp <- which( !apply(spcb=='?', 2, any) )
if(length(tmp))
dn <- dn + as.numeric(dist.dna(spb[, tmp], model='indel'))
# DN can be > 0 if DO is 0, because of indels
list(DN=dn, DO=do)
}, by=c('REP','BS')]
tmp
}, by='GENE']
# collect distances for genes
ans <- subset(df.gd, BS>1)[, list( GD_MEAN=mean(DN/DO), GD_SD=sd(DN/DO) ), by=c('GENE','REP')]
tmp <- subset(df.gd, BS==1)[, list( GD=ifelse(DO==0, NA_real_, DN/DO), DO=DO ), by=c('GENE','REP')]
ans <- merge(tmp, ans, by=c('GENE','REP'))
# calculate distances for full genome
tmp <- subset(df.gd, GENE%in%c('gag','pol','env'))[, list(GENE='gag+pol+env', GD=sum(DN)/sum(DO), DO=sum(DO)), by=c('REP','BS')]
tmp <- tmp[, list(GD= GD[BS==1], DO=DO[BS==1], GD_MEAN=mean(GD[BS>1]), GD_SD=sd(GD[BS>1])), by=c('GENE','REP')]
ans <- rbind(ans, tmp)
#})
ans
}, by=c('TAXA1','TAXA2')]
# save output to
save(tpi, file=file.path(outdir,outfile))
}
##--------------------------------------------------------------------------------------------------------
## olli 27.06.11
treecomparison.saturation<- function()
{
if.seqs <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations/150701_Regional_TRAIN2_SIMULATED.fa'
if.tree <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations_trees/150701_Regional_TRAIN2_SUBSTTREE.newick'
if.int <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations_trees/150701_Regional_TRAIN2_SIMULATED_INTERNAL.R'
# last gag: 1440; pol length: 2844; env length: 2523
# all gene codons are complete, so we can count through
#seq <- read.dna(if.seqs, format='fa')
load(if.int)
tmp <- tolower(do.call('rbind',strsplit(df.seq[, GAG],'')))
rownames(tmp) <- df.seq[, LABEL]
df.seq.gag <- as.DNAbin(tmp)
tmp <- tolower(do.call('rbind',strsplit(df.seq[, POL],'')))
rownames(tmp) <- df.seq[, LABEL]
df.seq.pol <- as.DNAbin(tmp)
tmp <- tolower(do.call('rbind',strsplit(df.seq[, ENV],'')))
rownames(tmp) <- df.seq[, LABEL]
df.seq.env <- as.DNAbin(tmp)
seq <- cbind(df.seq.gag,df.seq.pol,df.seq.env)
seq <- seq[,1:1440]
#seq <- seq[,1441:4284]
ph <- read.tree(if.tree)
tmp <- cophenetic.phylo(ph)
ds <- as.data.table(melt(as.matrix(tmp), stringsAsFactors=FALSE))
setnames(ds, c('Var1','Var2','value'), c('TAXA1','TAXA2','PATRISTIC_T'))
seq1 <- seq[,seq.int(1,ncol(seq),3)]
seq2 <- seq[,seq.int(2,ncol(seq),3)]
seq3 <- seq[,seq.int(3,ncol(seq),3)]
tmp <- dist.dna(seq1, model='raw', pairwise.deletion=TRUE)
tmp <- as.data.table(melt(as.matrix(tmp), stringsAsFactors=FALSE))
setnames(tmp, c('Var1','Var2','value'), c('TAXA1','TAXA2','RAW1_T'))
ds <- merge(ds, tmp, by=c('TAXA1','TAXA2'))
tmp <- dist.dna(seq2, model='raw', pairwise.deletion=TRUE)
tmp <- as.data.table(melt(as.matrix(tmp), stringsAsFactors=FALSE))
setnames(tmp, c('Var1','Var2','value'), c('TAXA1','TAXA2','RAW2_T'))
ds <- merge(ds, tmp, by=c('TAXA1','TAXA2'))
tmp <- dist.dna(seq3, model='raw', pairwise.deletion=TRUE)
tmp <- as.data.table(melt(as.matrix(tmp), stringsAsFactors=FALSE))
setnames(tmp, c('Var1','Var2','value'), c('TAXA1','TAXA2','RAW3_T'))
ds <- merge(ds, tmp, by=c('TAXA1','TAXA2'))
set(ds, NULL, 'TAXA1', ds[, as.character(TAXA1)])
set(ds, NULL, 'TAXA2', ds[, as.character(TAXA2)])
ds <- subset(ds, TAXA1<TAXA2)
dsm1 <- lm(RAW1_T~PATRISTIC_T, data=ds)
dsm2 <- lm(RAW2_T~PATRISTIC_T, data=ds)
dsm3 <- lm(RAW3_T~PATRISTIC_T, data=ds)
# ... there is no real saturation in the third codon position in the model!
dsp <- melt(ds, id.vars=c('TAXA1','TAXA2','PATRISTIC_T'))
ggplot(dsp, aes(x=PATRISTIC_T, y=value)) +
geom_point() +
theme_bw() + theme() +
facet_grid(~variable) +
labs(x='true patristic distance\n(substitutions / site)\n', y='distance between sequences\n(substitutions / site)\n')
}
##--------------------------------------------------------------------------------------------------------
## olli 27.06.11
treecomparison.create.metadata<- function()
{
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim_internal/freeze_July15'
outdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations'
infile.prefix <- '150701_Regional_TRAIN1_'
load( file.path(indir, paste(infile.prefix,'SIMULATED_INTERNAL.R',sep='')) )
tmp <- subset( df.inds, !is.na(TIME_SEQ), select=c(IDPOP, GENDER, DOB, DOD, DIAG_T, DIAG_CD4, ART1_T, ART1_CD4, TIME_SEQ, RECENT_TR ) )
set(tmp, NULL, 'GENDER', tmp[,as.character(GENDER)])
tmp2 <- tmp[, which(is.na(DIAG_T) & TIME_SEQ<2000)]
cat(paste('\nSet patient variables to NA for archival seq, n=',length(tmp2)))
set(tmp, tmp2, c('DOB','DOD'), NA_real_)
set(tmp, tmp2, 'GENDER', NA_character_)
tmp2 <- tmp[, which(is.na(DIAG_T) & TIME_SEQ>=2000)]
cat(paste('\nSet patient variables to NA after 2000, n=',length(tmp2)))
print(tmp[tmp2,])
set(tmp, tmp2, c('DOB','DOD'), NA_real_)
set(tmp, tmp2, 'GENDER', NA_character_)
set(tmp, NULL, 'GENDER', tmp[,factor(GENDER)])
file <- paste(outdir, '/', infile.prefix, 'SIMULATED_metadata.csv', sep='')
cat(paste('\nwrite to file', file))
write.csv(tmp, file)
}
##--------------------------------------------------------------------------------------------------------
##
##--------------------------------------------------------------------------------------------------------
treecomparison.ana.160627.standardize.MSE<- function()
{
require(ggplot2)
require(data.table)
require(ape)
require(scales)
require(ggtree)
require(phangorn)
edir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation'
timetag <- '160627'
load(file.path(edir,'submitted_160713_07MSELSD.rda'))
sc <- copy(sclu.info)
#
tmp <- subset(tinfo, !is.na(IDCLU))[, list(CLU_N=CLU_N[1], MXGPS_CLU= max(GPS), MDGPS_CLU=median(GPS)), by=c('SC','IDCLU')]
sc <- merge(sc, tmp, by=c('SC','IDCLU'))
set(sc, NULL, 'MODEL', sc[, factor(MODEL, levels=c('V','R'),labels=c('Model: Village','Model: Regional'))])
set(sc, sc[, which(SC=="VILL_99_APR15")],'SC',"150701_VILL_SCENARIO-C")
set(sc, NULL, 'SC', sc[, factor(SC, levels=c("150701_REGIONAL_TRAIN1", "150701_REGIONAL_TRAIN2", "150701_REGIONAL_TRAIN3", "150701_REGIONAL_TRAIN4","150701_REGIONAL_TRAIN5","150701_VILL_SCENARIO-A","150701_VILL_SCENARIO-B","150701_VILL_SCENARIO-C","150701_VILL_SCENARIO-D","150701_VILL_SCENARIO-E"),
labels=c('sc 1','sc 2','sc 3','sc 4','sc 5','sc A','sc B','sc C','sc D','sc E'))])
set(sc, NULL, 'GAPS', sc[, factor(GAPS, levels=c('none','low','high'),labels=c('none','as for Botswana\nsequences','as for Uganda\nsequences'))])
set(sc, NULL, 'BEST', sc[, factor(BEST, levels=c('Y','N'),labels=c('best tree','replicate tree'))])
set(sc, NULL, 'GENE', sc[, factor(GENE, levels=c('GAG','POL','GAG+POL+ENV'),labels=c('gag','pol','gag+pol+env'))])
set(sc, NULL, 'TEAM', sc[, factor(TEAM, levels=sc[, sort(unique(TEAM))],labels=sc[, sort(unique(TEAM))])])
set(sc, NULL, 'EXT', sc[, factor(EXT, levels=c('~0pc','5pc'),labels=c('~ 0%/year','5%/year'))])
set(sc, NULL, 'ART', sc[, factor(ART, levels=c('none','fast'),labels=c('none','fast'))])
sc <- subset(sc, OTHER=='N')
require(gamlss)
ggplot(subset(sc, TEAM!='MetaPIGA' & CLU_N<100), aes(x=CLU_N, y=MSE, colour=GENE, pch=TEAM)) + geom_point() + facet_grid(~SC)
ggplot(subset(sc, TEAM!='MetaPIGA' & SC=='sc 1'), aes(x=CLU_N, y=MSE, colour=GENE, pch=TEAM)) + geom_point()
ggplot(subset(sc, TEAM!='MetaPIGA'), aes(x=CLU_N, y=MSE, colour=GENE, pch=TEAM)) + geom_point() + coord_cartesian(ylim=c(0,1e4), xlim=c(0,100)) + facet_grid(~SC)
#
# look reasonable to divive KC by CLU_N*(CLU_N-1)/2
#
kc.std.d <- subset(sc, TEAM!='MetaPIGA' & SC=='sc 1')
kc.std.m1 <- gamlss(KC~CLU_N-1, data=kc.std.d)
kc.std.m2 <- gamlss(KC~poly(CLU_N,2, raw=TRUE), data=kc.std.d) #this allows for a non-zero baseline, which gave much better fit
#gamlss(KC~CLU_N+I(CLU_N^2)-1, data=kc.std.d)
kc.std.m3 <- gamlss(KC~I(CLU_N*(CLU_N-1)/2), data=kc.std.d)
kc.std.m4 <- gamlss(KC~poly(CLU_N,4, raw=TRUE), data=kc.std.d)
#kc.std.m4 <- gamlss(KC~I(sqrt(CLU_N*(CLU_N-1)/2))-1, data=kc.std.d)
kc.std.da <- subset(sc, TEAM!='MetaPIGA' & SC%in%c('sc 1','sc 2','sc 4'))
tmp.m1 <- predict(kc.std.m1, data=kc.std.d, newdata=kc.std.da, type='response', se.fit=FALSE)
tmp.m2 <- predict(kc.std.m2, data=kc.std.d, newdata=kc.std.da,type='response', se.fit=FALSE)
tmp.m3 <- predict(kc.std.m3, data=kc.std.d, newdata=kc.std.da,type='response', se.fit=FALSE)
tmp.m4 <- predict(kc.std.m4, data=kc.std.d, newdata=kc.std.da,type='response', se.fit=FALSE)
kc.std.da[, KC.m1:=tmp.m1]
kc.std.da[, KC.m2:=tmp.m2]
kc.std.da[, KC.m3:=tmp.m3]
kc.std.da[, KC.m4:=tmp.m4]
kc.std.da <- melt(kc.std.da, measure.vars=c('KC.m1','KC.m2','KC.m3','KC.m4'))
set(kc.std.da, NULL, 'variable', kc.std.da[, factor(variable, levels=c('KC.m1','KC.m2','KC.m3','KC.m4'), labels=c('KC~CLU_N-1','KC~poly(CLU_N,2, raw=TRUE)','KC~I(CLU_N*(CLU_N-1)/2)','KC~poly(CLU_N,4, raw=TRUE)'))])
ggplot(kc.std.da, aes(x=CLU_N)) + geom_point(aes(y=KC,colour=GENE, pch=TEAM)) +
geom_line(aes(y=value, linetype=variable, group=variable)) +
#scale_linetype_manual(values=c('KC~CLU_N-1'='a','KC~poly(CLU_N,2, raw=TRUE)-1'='e','KC~I(CLU_N*(CLU_N-1)/2)-1'='f','KC~poly(CLU_N,4, raw=TRUE)-1'='j')) +
facet_grid(~SC)
ggsave(file.path(edir, paste(timetag,'_','dependence_KC_clustersize.pdf',sep='')), w=15, h=7)
}
##--------------------------------------------------------------------------------------------------------
##
##--------------------------------------------------------------------------------------------------------
treecomparison.ana.160627.standardize.KC<- function()
{
require(ggplot2)
require(data.table)
require(ape)
require(scales)
require(ggtree)
require(phangorn)
edir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation'
timetag <- '160713'
load(paste(edir,'/','submitted_160713_07MSELSD.rda',sep=''))
sc <- copy(sclu.info)
#
tmp <- subset(tinfo, !is.na(IDCLU))[, list(CLU_N=CLU_N[1], MXGPS_CLU= max(GPS), MDGPS_CLU=median(GPS)), by=c('SC','IDCLU')]
sc <- merge(sc, tmp, by=c('SC','IDCLU'))
set(sc, NULL, 'MODEL', sc[, factor(MODEL, levels=c('V','R'),labels=c('Model: Village','Model: Regional'))])
set(sc, sc[, which(SC=="VILL_99_APR15")],'SC',"150701_VILL_SCENARIO-C")
set(sc, NULL, 'SC', sc[, factor(SC, levels=c("150701_REGIONAL_TRAIN1", "150701_REGIONAL_TRAIN2", "150701_REGIONAL_TRAIN3", "150701_REGIONAL_TRAIN4","150701_REGIONAL_TRAIN5","150701_VILL_SCENARIO-A","150701_VILL_SCENARIO-B","150701_VILL_SCENARIO-C","150701_VILL_SCENARIO-D","150701_VILL_SCENARIO-E"),
labels=c('sc 1','sc 2','sc 3','sc 4','sc 5','sc A','sc B','sc C','sc D','sc E'))])
set(sc, NULL, 'GAPS', sc[, factor(GAPS, levels=c('none','low','high'),labels=c('none','as for Botswana\nsequences','as for Uganda\nsequences'))])
set(sc, NULL, 'BEST', sc[, factor(BEST, levels=c('Y','N'),labels=c('best tree','replicate tree'))])
set(sc, NULL, 'GENE', sc[, factor(GENE, levels=c('GAG','POL','GAG+POL+ENV'),labels=c('gag','pol','gag+pol+env'))])
set(sc, NULL, 'TEAM', sc[, factor(TEAM, levels=sc[, sort(unique(TEAM))],labels=sc[, sort(unique(TEAM))])])
set(sc, NULL, 'EXT', sc[, factor(EXT, levels=c('~0pc','5pc'),labels=c('~ 0%/year','5%/year'))])
set(sc, NULL, 'ART', sc[, factor(ART, levels=c('none','fast'),labels=c('none','fast'))])
sc <- subset(sc, OTHER=='N')
require(gamlss)
ggplot(subset(sc, TEAM!='MetaPIGA' & CLU_N<100), aes(x=CLU_N, y=KC, colour=GENE, pch=TEAM)) + geom_point() + facet_grid(~SC)
ggplot(subset(sc, TEAM!='MetaPIGA' & SC=='sc 1'), aes(x=CLU_N, y=KC, colour=GENE, pch=TEAM)) + geom_point()
#
# look reasonable to divive KC by CLU_N*(CLU_N-1)/2
#
kc.std.d <- subset(sc, TEAM!='MetaPIGA' & SC=='sc 1', select=c(SC,TEAM,GENE,CLU_N, KC))
kc.std.m1 <- gamlss(KC~CLU_N-1, data=kc.std.d)
kc.std.m2 <- gamlss(KC~poly(CLU_N,2, raw=TRUE), data=kc.std.d) #this allows for a non-zero baseline, which gave much better fit
#gamlss(KC~CLU_N+I(CLU_N^2)-1, data=kc.std.d)
kc.std.m3 <- gamlss(KC~I(CLU_N*(CLU_N-1)/2), data=kc.std.d)
kc.std.m4 <- gamlss(KC~poly(CLU_N,4, raw=TRUE), data=kc.std.d)
#kc.std.m4 <- gamlss(KC~I(sqrt(CLU_N*(CLU_N-1)/2))-1, data=kc.std.d)
kc.std.da <- subset(sc, !TEAM%in%c('MetaPIGA','MVR','BioNJ') & SC%in%c('sc 1','sc 2','sc 4'), select=c(SC,TEAM,GENE,CLU_N, KC))
tmp.m1 <- predict(kc.std.m1, data=kc.std.d, newdata=kc.std.da, type='response', se.fit=FALSE)
tmp.m2 <- predict(kc.std.m2, data=kc.std.d, newdata=kc.std.da,type='response', se.fit=FALSE)
tmp.m3 <- predict(kc.std.m3, data=kc.std.d, newdata=kc.std.da,type='response', se.fit=FALSE)
tmp.m4 <- predict(kc.std.m4, data=kc.std.d, newdata=kc.std.da,type='response', se.fit=FALSE)
kc.std.da[, KC.m1:=tmp.m1]
kc.std.da[, KC.m2:=tmp.m2]
kc.std.da[, KC.m3:=tmp.m3]
kc.std.da[, KC.m4:=tmp.m4]
kc.std.da <- melt(kc.std.da, measure.vars=c('KC.m1','KC.m2','KC.m3','KC.m4'))
set(kc.std.da, NULL, 'variable', kc.std.da[, factor(variable, levels=c('KC.m1','KC.m2','KC.m3','KC.m4'), labels=c('KC~CLU_N-1','KC~poly(CLU_N,2, raw=TRUE)','KC~I(CLU_N*(CLU_N-1)/2)','KC~poly(CLU_N,4, raw=TRUE)'))])
ggplot(kc.std.da, aes(x=CLU_N)) + geom_point(aes(y=KC,colour=GENE, pch=TEAM)) +
geom_line(aes(y=value, linetype=variable, group=variable)) +
#scale_linetype_manual(values=c('KC~CLU_N-1'='a','KC~poly(CLU_N,2, raw=TRUE)-1'='e','KC~I(CLU_N*(CLU_N-1)/2)-1'='f','KC~poly(CLU_N,4, raw=TRUE)-1'='j')) +
facet_grid(~SC)
ggsave(file.path(edir, paste(timetag,'_','dependence_KC_clustersize.pdf',sep='')), w=15, h=7)
}
##--------------------------------------------------------------------------------------------------------
## olli 27.06.16
##--------------------------------------------------------------------------------------------------------
treecomparison.submissions.160713<- function()
{
load(paste(edir,'/','submitted_160713_QD.rda',sep=''))
tmp <- subset(submitted.info, select=c(IDX, TAXA_NJ, NQD))
tmp2 <- subset(sclu.info, select=c(IDX, IDCLU, NQDC))
load(paste(edir,'/','submitted_160713.rda',sep=''))
submitted.info <- merge(submitted.info, tmp, by='IDX')
sclu.info <- merge(sclu.info, tmp2, by=c('IDX','IDCLU'))
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations'
infiles <- data.table(FILE=list.files(indir, full.names=1, pattern='_TRAIN[0-9]+_SIMULATED.fa$'))
infiles[, SC:= toupper(gsub('_SIMULATED.fa','',basename(FILE)))]
tmp <- infiles[, {
#FILE<- "/Users/Oliver/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations/150701_Regional_TRAIN4_SIMULATED.fa"
sq <- read.dna(FILE, format='fa')
seqi<- as.data.table(read.csv(gsub('.fa','_gene.txt',FILE), header=FALSE))
seqi[, GENE:= regmatches(V2, regexpr('[a-z]+', V2))]
seqi[, START:= as.integer(gsub('-','',regmatches(V2, regexpr('[0-9]+-', V2))))]
seqi[, END:= as.integer(gsub('-','',regmatches(V2, regexpr('-[0-9]+', V2))))]
seqi <- subset(seqi, select=c(GENE,START,END))
seqi <- rbind(seqi, data.table(GENE='full', START=1L, END= seqi[, max(END)]))
ans <- seqi[, {
#START<- 1474; END<- 5466
z <- as.character(sq[,START:END])
tmp <- apply(z, 2, function(x) all(x%in%c('-','?')))
z <- z[, !tmp]
tmp <- apply(z, 1, function(x) length(which(x=='?'))) / ncol(z)
list(TAXA=names(tmp), GAPS_P=tmp)
}, by='GENE']
set(ans, NULL, 'GENE', ans[, paste(toupper(GENE),'_GAPS_P',sep='')])
ans <- dcast.data.table(ans, TAXA~GENE, value.var='GAPS_P')
ans
}, by='SC']
tmp[, list(FULL_GAPS_P=mean(FULL_GAPS_P), GAG_GAPS_P=mean(GAG_GAPS_P), POL_GAPS_P=mean(POL_GAPS_P), ENV_GAPS_P=mean(ENV_GAPS_P)), by='SC']
tinfo <- merge(tinfo, tmp, by=c('SC','TAXA'), all.x=1)
save(strs, strs_rtt, ttrs, tinfo, tfiles, tinfo.pairs, submitted.info, sclu.info, lba, file=file.path(edir, 'submitted_160713_RFPDQDTP.rda'))
}
##--------------------------------------------------------------------------------------------------------
## olli 27.06.16
##--------------------------------------------------------------------------------------------------------
treecomparison.combine.stuffoncluster.161123<- function()
{
require(ape)
require(data.table)
indir <- '~/duke/tmp/tc'
indir <- '/work/or105/Gates_2014/tree_comparison'
infiles <- list.files(indir, pattern='hpc[0-9]+_09SBRL.rda$',full.names=TRUE)
cat('\n use as first batch', infiles[1])
load( infiles[1] )
# load first batch
# "strs", "strs_rtt", "strs_lsd", "ttrs", "trungps", "tinfo", "tfiles", "submitted.info", "sclu.info", "lba"
submitted.info.all <- copy(submitted.info)
sclu.info.all <- copy(sclu.info)
lba.all <- copy(lba)
strs_rtt.all <- copy(strs_rtt)
# add next batches
infiles <- infiles[-1]
for( i in seq_along(infiles))
{
#i <- 2
cat('\n add next batch', infiles[i])
load( infiles[i] )
submitted.info.all <- rbind(submitted.info.all, submitted.info)
sclu.info.all <- rbind(sclu.info.all, sclu.info)
lba.all <- rbind(lba.all, lba)
tmp <- unname(which(!sapply(strs_rtt, is.null)))
for(j in tmp)
strs_rtt.all[[j]] <- strs_rtt[[j]]
gc()
}
# save
submitted.info <- copy(submitted.info.all)
sclu.info <- copy(sclu.info.all)
lba <- copy(lba.all)
strs_rtt <- copy(strs_rtt.all)
tmp <- gsub('_hpc[0-9]+','',infiles[i])
cat('\nsave combined output to', tmp)
save(strs,strs_rtt,strs_lsd,ttrs,trungps,tinfo,tfiles,submitted.info,sclu.info,lba, file=tmp)
}
##--------------------------------------------------------------------------------------------------------
## olli 27.06.16
##--------------------------------------------------------------------------------------------------------
treecomparison.submissions.161123<- function()
{
require(data.table)
require(ape)
require(adephylo)
require(phangorn)
require(parallel)
#
# get true trees
#
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations_trees'
tfiles <- list.files(indir, pattern='newick$', full.names=TRUE)
tfiles <- data.table( FILE_T= tfiles[ grepl('SUBSTTREE', tfiles) | grepl('Vill_99', tfiles) | grepl('.*DATEDTREE', tfiles) ] )
tfiles[, SC:= toupper(gsub('_SUBSTTREE|_DATEDTREE','',gsub('.newick','',basename(FILE_T))))]
tmp <- rbind( subset(tfiles, SC=='VILL_99_APR15'), subset(tfiles, SC=='VILL_99_APR15'), subset(tfiles, SC=='VILL_99_APR15') )
set(tmp, NULL, 'SC', c('150701_VILL_SCENARIO-C','150701_VILL_SCENARIO-D','150701_VILL_SCENARIO-E'))
tfiles <- rbind(tfiles, tmp)
#tmp <- list.files(indir, pattern='newick$', full.names=TRUE)
#tmp <- data.table( FILE_T= tmp[ grepl('*DATEDTREE', tmp) ] )
#tmp[, SC:= toupper(gsub('_SUBSTTREE|_DATEDTREE','',gsub('.newick','',basename(FILE_T))))]
#tfiles <- rbind(tfiles, tmp)
tfiles[, BRL_T:= 'time']
set(tfiles, NULL, 'SC', tfiles[, gsub('161121_GTR','161121_REGIONAL_GTR',SC)])
set(tfiles, tfiles[, which(grepl('REG',SC) & grepl('SUBST',FILE_T))], 'BRL_T', 'subst')
ttrs <- lapply(tfiles[, FILE_T], function(x) read.tree(file=x) )
names(ttrs) <- tfiles[, SC]
for(z in c('VILL_99_APR15','150701_VILL_SCENARIO-C','150701_VILL_SCENARIO-D','150701_VILL_SCENARIO-E'))
ttrs[[z]] <- root(ttrs[[z]], node=Ntip(ttrs[[z]])+2, resolve.root=1)
tfiles[, IDX_T:=seq_along(ttrs)]
tfiles[, TAXAN_T:= sapply(ttrs, Ntip)]
# patristic distances on true trees (by time and subst/site)
tbrl <- NULL
# info on true trees
tinfo <- merge(tfiles, do.call('rbind',lapply(seq_along(ttrs), function(i) data.table(TAXA=ttrs[[i]]$tip.label, IDX_T=i))), by='IDX_T')
tinfo[, IDPOP:=NA_character_]
tmp <- tinfo[, which(grepl('REGIONAL',SC))]
set(tinfo, tmp, 'IDPOP', tinfo[tmp,regmatches(TAXA, regexpr('IDPOP_[0-9]+',TAXA))])
tmp <- tinfo[, which(!grepl('REGIONAL',SC))]
set(tinfo, tmp, 'IDPOP', tinfo[tmp, regmatches(TAXA, regexpr('HOUSE[0-9]+-[0-9]+|House[0-9]+-[0-9]+',TAXA))])
stopifnot(subset(tinfo, grepl('VILL',SC))[, length(which(substring(TAXA,1,10)!=substring(IDPOP,1,10)))]==0)
stopifnot( tinfo[, length(which(is.na(IDPOP)))==0] )
set(tinfo, NULL, 'IDPOP', tinfo[,toupper(IDPOP)])
set(tinfo, NULL, 'TAXA', tinfo[,toupper(TAXA)])
# read cluster membership from DATEDCLUTREES
tmp <- list.files(indir, pattern='DATEDCLUTREES', full.names=TRUE)
tmp <- rbind( data.table( FILE_CLU_T= tmp,
SC= gsub('161121_GTR','161121_REGIONAL_GTR',toupper(gsub('_DATEDCLUTREES','',gsub('.newick','',basename(tmp))))),
BRL_T= 'time'),
data.table( FILE_CLU_T= tmp,
SC= gsub('161121_GTR','161121_REGIONAL_GTR',toupper(gsub('_DATEDCLUTREES','',gsub('.newick','',basename(tmp))))),
BRL_T= 'subst') )
tfiles <- merge(tfiles, tmp, by=c('SC','BRL_T'), all=1)
tmp <- subset(tfiles, !is.na(FILE_CLU_T))[, {
z <- read.tree(FILE_CLU_T)
do.call('rbind',lapply(seq_along(z), function(i) data.table(IDCLU=i, TAXA=z[[i]]$tip.label)))
}, by=c('SC','BRL_T')]
tinfo <- merge(tinfo, tmp, by=c('SC','BRL_T','TAXA'), all=1)
tmp <- subset(tinfo, !is.na(IDCLU))[, list(CLU_N= length(IDPOP)), by=c('SC','BRL_T','IDCLU')]
tinfo <- merge(tinfo, tmp, by=c('SC','BRL_T','IDCLU'), all=1)
# read sequences and determine %gappiness in full alignment
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/running_gaps_simulations2'
tmp <- list.files(indir, pattern='fa$|fasta$', full.names=TRUE)
trungps <- data.table( FILE_SEQ_T= tmp,
TEAM= 'RUNGAPS_EXCLTAXA',
SC='150701_Regional_TRAIN2',
GENE='FULL',
RUNGAPS= as.numeric(gsub('TRAIN[0-9]([0-9][0-9]).*','\\1',regmatches(tmp,regexpr('TRAIN[0-9]+',tmp))))/100,
RUNGAPS_EXCL= as.numeric(gsub('TRAIN[0-9][0-9][0-9]([0-9][0-9]).*','\\1',regmatches(tmp,regexpr('TRAIN[0-9]+',tmp))))/100
)
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/running_gaps_simulations3'
tmp <- list.files(indir, pattern='fa$|fasta$', full.names=TRUE)
tmp <- tmp[!grepl('WORST',tmp)]
tmp <- data.table( FILE_SEQ_T= tmp,
TEAM= 'RUNGAPS_EXCLSITE',
SC='150701_Regional_TRAIN2',
GENE=regmatches(tmp,regexpr('FULL|GAG|GAGPP|P17|FULL',tmp)),
RUNGAPS= as.numeric(gsub('TRAIN[0-9]([0-9][0-9]).*','\\1',regmatches(tmp,regexpr('TRAIN[0-9]+',tmp))))/100,
RUNGAPS_EXCL= as.numeric(gsub('EXCLSITES','',regmatches(tmp,regexpr('EXCLSITES[0-9]+',tmp))))/100
)
trungps <- rbind(trungps, tmp)
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/running_gaps_simulations'
tmp <- list.files(indir, pattern='fa$|fasta$', full.names=TRUE)
tmp <- data.table( FILE_SEQ_T= tmp,
TEAM= 'RUNGAPS_ExaML',
SC=gsub('(150701_Regional_TRAIN[0-9]).*','\\1',basename(tmp)),
GENE=regmatches(tmp,regexpr('FULL|GAG|GAGPP|P17|FULL',tmp)),
RUNGAPS= as.numeric(gsub('TRAIN[0-9]([0-9][0-9]).*','\\1',regmatches(tmp,regexpr('TRAIN[0-9]+',tmp))))/100,
RUNGAPS_EXCL= 1
)
trungps <- rbind(trungps, tmp)
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations'
tmp <- list.files(indir, full.names=1, pattern='_TRAIN[0-9]+_SIMULATED.fa$|GTRFIXED.*_SIMULATED.fasta$')
tmp <- data.table( FILE_SEQ_T= tmp,
TEAM= NA_real_,
SC=gsub('161121_','161121_REGIONAL_',toupper(gsub('_SIMULATED.fa|_SIMULATED.fasta','',basename(tmp)))),
GENE=gsub('\\.fa$','FULL',regmatches(tmp,regexpr('\\.fa$|FULL|GAG|GAGPP|P17|FULL',tmp))),
RUNGAPS= NA_real_,
RUNGAPS_EXCL= NA_real_
)
trungps <- rbind(trungps, tmp)
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/running_gaps_simulations4'
tmp <- list.files(indir, full.names=1, pattern='_TRAIN[0-9]+_FULL_SIMULATED.fa$')
tmp <- data.table( FILE_SEQ_T= tmp,
TEAM= 'RUNGAPS_ExaML',
SC=toupper(gsub('[0-9][0-9]_FULL_SIMULATED.fa|_SIMULATED.fasta','',basename(tmp))),
GENE=gsub('\\.fa$','FULL',regmatches(tmp,regexpr('\\.fa$|FULL|GAG|GAGPP|P17|FULL',tmp))),
RUNGAPS= as.numeric(gsub('TRAIN[0-9]([0-9][0-9]).*','\\1',regmatches(tmp,regexpr('TRAIN[0-9]+',tmp))))/100,
RUNGAPS_EXCL= 1
)
trungps <- rbind(trungps, tmp)
#
trungps <- trungps[, {
# FILE_SEQ_T<- '/Users/Oliver/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/running_gaps_simulations2/150701_Regional_TRAIN20250_FULL_SIMULATED.fa'
cat('\n',FILE_SEQ_T)
z <- read.dna(FILE_SEQ_T, format='fasta')
ans <- sapply(seq_len(nrow(z)), function(i) base.freq(z[i,], all=1))
list( TAXA=rownames(z),
ACTG_P=apply(ans[c('a','c','t','g'),], 2, sum),
UNASS_P=ans['?',],
NCOL=ncol(z))
}, by=c('FILE_SEQ_T','SC','TEAM','GENE','RUNGAPS','RUNGAPS_EXCL')]
trungps <- trungps[, list(ACTG_P=mean(ACTG_P), UNASS_P=mean(UNASS_P), SITES_N=NCOL[1]), by=c('FILE_SEQ_T','SC','TEAM','GENE','RUNGAPS','RUNGAPS_EXCL')]
#
# to tinfo add actual transmitters
#
# check TRAIN1
load( '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations_trees/150701_Regional_TRAIN1_SIMULATED_INTERNAL.R' )
ch <- subset(tinfo, SC=='150701_REGIONAL_TRAIN1' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(abs(DOB-DOB_CH)<=2*.Machine$double.eps)], ch[, all(GENDER==GENDER_CH)], ch[, all(abs(TIME_SEQ-TIME_SEQ_CH)<=0.001)] )
# OK :-) schedule adding IDPOP_T
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDREC,sep='')]]
tinfo.add <- subset(df.trms, select=c(IDPOP, IDTR))
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDTR,sep='')]]
tinfo.add <- merge(tinfo.add, subset(df.trms, select=c(IDPOP, IDREC)), by='IDPOP', all=1)
set(ch, NULL, 'IDPOP', ch[, paste('IDPOP_',IDPOP,sep='')])
tinfo.add <- merge(tinfo.add, subset(ch, select=IDPOP), by='IDPOP')
tinfo.add[, SC:='150701_REGIONAL_TRAIN1']
# check TRAIN2
load( '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations_trees/150701_Regional_TRAIN2_SIMULATED_INTERNAL.R' )
ch <- subset(tinfo, SC=='150701_REGIONAL_TRAIN2' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(abs(DOB-DOB_CH)<=2*.Machine$double.eps)], ch[, all(GENDER==GENDER_CH)], ch[, all(abs(TIME_SEQ-TIME_SEQ_CH)<=0.001)] )
# OK :-) schedule adding IDPOP_T
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDREC,sep='')]]
tmp <- subset(df.trms, select=c(IDPOP, IDTR))
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDTR,sep='')]]
tmp <- merge(tmp, subset(df.trms, select=c(IDPOP, IDREC)), by='IDPOP', all=1)
set(ch, NULL, 'IDPOP', ch[, paste('IDPOP_',IDPOP,sep='')])
tmp <- merge(tmp, subset(ch, select=IDPOP), by='IDPOP')
tmp[, SC:='150701_REGIONAL_TRAIN2']
tinfo.add <- rbind(tinfo.add, tmp)
# check TRAIN4
ch <- subset(tinfo, SC=='150701_REGIONAL_TRAIN4' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(abs(DOB-DOB_CH)<=2*.Machine$double.eps)], ch[, all(GENDER==GENDER_CH)], ch[, all(abs(TIME_SEQ-TIME_SEQ_CH)<=0.001)] )
tmp[, SC:='150701_REGIONAL_TRAIN4']
tinfo.add <- rbind(tinfo.add, tmp)
# check TRAIN3
load( '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations_trees/150701_Regional_TRAIN3_SIMULATED_INTERNAL.R' )
ch <- subset(tinfo, SC=='150701_REGIONAL_TRAIN3' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(abs(DOB-DOB_CH)<=2*.Machine$double.eps)], ch[, all(GENDER==GENDER_CH)], ch[, all(abs(TIME_SEQ-TIME_SEQ_CH)<=0.001)] )
# OK :-) schedule adding IDPOP_T
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDREC,sep='')]]
tmp <- subset(df.trms, select=c(IDPOP, IDTR))
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDTR,sep='')]]
tmp <- merge(tmp, subset(df.trms, select=c(IDPOP, IDREC)), by='IDPOP', all=1)
set(ch, NULL, 'IDPOP', ch[, paste('IDPOP_',IDPOP,sep='')])
tmp <- merge(tmp, subset(ch, select=IDPOP), by='IDPOP')
tmp[, SC:='150701_REGIONAL_TRAIN3']
tinfo.add <- rbind(tinfo.add, tmp)
# check TRAIN5
ch <- subset(tinfo, SC=='150701_REGIONAL_TRAIN5' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(abs(DOB-DOB_CH)<=2*.Machine$double.eps)], ch[, all(GENDER==GENDER_CH)], ch[, all(abs(TIME_SEQ-TIME_SEQ_CH)<=0.001)] )
tmp[, SC:='150701_REGIONAL_TRAIN5']
tinfo.add <- rbind(tinfo.add, tmp)
# check TRAIN6
load( '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations_trees/161121_Regional_TRAIN6_SIMULATED_INTERNAL.R' )
ch <- subset(tinfo, SC=='161121_REGIONAL_TRAIN6' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(abs(DOB-DOB_CH)<=2*.Machine$double.eps)], ch[, all(GENDER==GENDER_CH)], ch[, all(abs(TIME_SEQ-TIME_SEQ_CH)<=0.001)] )
# OK :-) schedule adding IDPOP_T
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDREC,sep='')]]
tmp <- subset(df.trms, select=c(IDPOP, IDTR))
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDTR,sep='')]]
tmp <- merge(tmp, subset(df.trms, select=c(IDPOP, IDREC)), by='IDPOP', all=1)
set(ch, NULL, 'IDPOP', ch[, paste('IDPOP_',IDPOP,sep='')])
tmp <- merge(tmp, subset(ch, select=IDPOP), by='IDPOP')
tmp[, SC:='161121_REGIONAL_TRAIN6']
tinfo.add <- rbind(tinfo.add, tmp)
# check GTRFIXED2
load( '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations_trees/161121_GTRFIXED2_SIMULATED_INTERNAL.R' )
ch <- subset(tinfo, SC=='161121_REGIONAL_GTRFIXED2' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(abs(DOB-DOB_CH)<=2*.Machine$double.eps)], ch[, all(GENDER==GENDER_CH)], ch[, all(abs(TIME_SEQ-TIME_SEQ_CH)<=0.001)] )
# OK :-) schedule adding IDPOP_T
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDREC,sep='')]]
tmp <- subset(df.trms, select=c(IDPOP, IDTR))
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDTR,sep='')]]
tmp <- merge(tmp, subset(df.trms, select=c(IDPOP, IDREC)), by='IDPOP', all=1)
set(ch, NULL, 'IDPOP', ch[, paste('IDPOP_',IDPOP,sep='')])
tmp <- merge(tmp, subset(ch, select=IDPOP), by='IDPOP')
tmp[, SC:='161121_REGIONAL_GTRFIXED2']
tinfo.add <- rbind(tinfo.add, tmp)
# check GTRFIXED3
load( '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations_trees/161121_GTRFIXED3_SIMULATED_INTERNAL.R' )
ch <- subset(tinfo, SC=='161121_REGIONAL_GTRFIXED3' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(abs(DOB-DOB_CH)<=2*.Machine$double.eps)], ch[, all(GENDER==GENDER_CH)], ch[, all(abs(TIME_SEQ-TIME_SEQ_CH)<=0.001)] )
# OK :-) schedule adding IDPOP_T
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDREC,sep='')]]
tmp <- subset(df.trms, select=c(IDPOP, IDTR))
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDTR,sep='')]]
tmp <- merge(tmp, subset(df.trms, select=c(IDPOP, IDREC)), by='IDPOP', all=1)
set(ch, NULL, 'IDPOP', ch[, paste('IDPOP_',IDPOP,sep='')])
tmp <- merge(tmp, subset(ch, select=IDPOP), by='IDPOP')
tmp[, SC:='161121_REGIONAL_GTRFIXED3']
tinfo.add <- rbind(tinfo.add, tmp)
# check GTRFIXED1
load( '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations_trees/161121_GTRFIXED1_SIMULATED_INTERNAL.R' )
ch <- subset(tinfo, SC=='161121_REGIONAL_GTRFIXED1' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(abs(DOB-DOB_CH)<=2*.Machine$double.eps)], ch[, all(GENDER==GENDER_CH)], ch[, all(abs(TIME_SEQ-TIME_SEQ_CH)<=0.001)] )
# OK :-) schedule adding IDPOP_T
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDREC,sep='')]]
tmp <- subset(df.trms, select=c(IDPOP, IDTR))
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDTR,sep='')]]
tmp <- merge(tmp, subset(df.trms, select=c(IDPOP, IDREC)), by='IDPOP', all=1)
set(ch, NULL, 'IDPOP', ch[, paste('IDPOP_',IDPOP,sep='')])
tmp <- merge(tmp, subset(ch, select=IDPOP), by='IDPOP')
tmp[, SC:='161121_REGIONAL_GTRFIXED1']
tinfo.add <- rbind(tinfo.add, tmp)
#
# add transmitters for regional to tinfo
#
tinfo <- merge(tinfo, tinfo.add, by=c('IDPOP', 'SC'), all.x=1)
#
# add true node depths to tinfo
#
tmp <- tinfo[, {
cat(IDX_T,'\n')
ph<- ttrs[[IDX_T]]
list(DEPTH_T=node.depth.edgelength(ph)[seq_len(Ntip(ph))], TAXA=ph$tip.label)
}, by='IDX_T']
tinfo <- merge(tinfo, tmp, by=c('IDX_T','TAXA'), all.x=1)
#
# compute closest individual on true trees
#
tmp <- unique(subset(tinfo, select=c(SC, BRL_T, IDX_T)))
tmp <- tmp[, {
print(IDX_T)
ph <- ttrs[[IDX_T]]
model.reg <- grepl('REGIONAL',SC)
treedist.closest.ind(ph, model.reg)
}, by=c('SC','BRL_T','IDX_T')]
tinfo <- merge(tinfo, tmp, by=c('SC','BRL_T','IDX_T','IDPOP'))
set(tinfo, NULL, 'IDPOP_CL', tinfo[, gsub('IDPOP_','',IDPOP_CL)])
#
# add if transmitter sampled
#
tmp <- subset(tinfo, grepl('REGIONAL',SC))
set(tmp, NULL, 'IDPOP', tmp[,as.integer(gsub('IDPOP_','',IDPOP))])
setkey(tmp, IDX_T, IDPOP)
tmp <- unique(tmp)[, {
z <- IDX_T
list(IDTR_SAMPLED=ifelse(IDTR%in%subset(tmp, IDX_T==z)[['IDPOP']], 'Y', 'N'))
}, by=c('IDX_T','IDPOP')]
set(tmp, NULL, 'IDPOP', tmp[, paste('IDPOP_',IDPOP,sep='')])
tinfo <- merge(tinfo, tmp, by=c('IDX_T','IDPOP'),all.x=1)
#
# get submitted trees
#
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/running_gaps'
infiles <- list.files(indir, pattern='newick$', recursive=1, full.names=1)
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/running_gaps2'
infiles <- c(infiles, list.files(indir, pattern='newick$', recursive=1, full.names=1))
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simple_GTR'
infiles <- c(infiles, list.files(indir, pattern='newick$', recursive=1, full.names=1))
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/partiallen'
infiles <- c(infiles, list.files(indir, pattern='newick$', recursive=1, full.names=1))
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/running_gaps3'
infiles <- c(infiles, list.files(indir, pattern='newick$', recursive=1, full.names=1))
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/running_gaps4'
infiles <- c(infiles, list.files(indir, pattern='newick$', recursive=1, full.names=1))
infiles <- data.table(FILE=infiles)
strs <- lapply(infiles[, FILE], function(x)
{
cat('\n',x)
read.tree(file=x)
})
names(strs) <- infiles[, FILE]
#
#
#
submitted.info <- data.table(FILE=names(strs))
submitted.info[, IDX:=seq_along(strs)]
#
# set team
#
submitted.info[, TEAM:=NA_character_]
set(submitted.info, submitted.info[, which(grepl('running_gaps/',FILE))], 'TEAM', 'RUNGAPS_ExaML')
set(submitted.info, submitted.info[, which(grepl('running_gaps2',FILE))], 'TEAM', 'RUNGAPS_EXCLTAXA')
set(submitted.info, submitted.info[, which(grepl('running_gaps3',FILE))], 'TEAM', 'RUNGAPS_EXCLSITE')
set(submitted.info, submitted.info[, which(grepl('simple_GTR',FILE))], 'TEAM', 'GTRFIXED')
set(submitted.info, submitted.info[, which(grepl('partiallen',FILE))], 'TEAM', 'PLEN')
set(submitted.info, submitted.info[, which(grepl('running_gaps4',FILE) & !grepl('_PL[0-9]+_',FILE))], 'TEAM', 'RUNGAPS_ExaML')
set(submitted.info, submitted.info[, which(grepl('running_gaps4',FILE) & grepl('_PL[0-9]+_',FILE))], 'TEAM', 'PLEN')
stopifnot( submitted.info[, length(which(is.na(TEAM)))==0] )
#
# scenario
#
submitted.info[, SC:=NA_character_]
tmp <- submitted.info[, which(grepl('161121_GTRFIXED[0-9]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, gsub('161121_','161121_REGIONAL_',regmatches(FILE, regexpr('161121_GTRFIXED[0-9]',FILE)))])
tmp <- submitted.info[, which(grepl('150701_Regional_TRAIN2', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, regmatches(FILE, regexpr('150701_Regional_TRAIN2',FILE))])
tmp <- submitted.info[, which(grepl('150701_Regional_TRAIN1|161125_Regional_TRAIN1', FILE))]
set(submitted.info, tmp, 'SC', '150701_REGIONAL_TRAIN1')
tmp <- submitted.info[, which(grepl('161125_Regional_TRAIN6|161121_Regional_TRAIN6', FILE))]
set(submitted.info, tmp, 'SC', '161121_REGIONAL_TRAIN6')
set(submitted.info, NULL, 'SC', submitted.info[, toupper(SC)])
stopifnot( submitted.info[, length(which(is.na(SC)))==0] )
#
# define running gaps for the running gaps analyses
#
submitted.info[, RUNGAPS:=NA_real_]
tmp <- submitted.info[, which(grepl('RUNGAPS',TEAM))]
set(submitted.info, tmp, 'RUNGAPS', submitted.info[tmp, as.numeric(gsub('.*TRAIN[0-9]([0-9][0-9]).*','\\1',regmatches(FILE,regexpr('TRAIN[0-9]+',FILE))))/100])
tmp <- submitted.info[, which('PLEN'==TEAM)]
set(submitted.info, tmp, 'RUNGAPS', 0)
stopifnot( !nrow(subset(submitted.info, is.na(RUNGAPS) & grepl('RUNGAPS',TEAM))) )
#
# define running gaps2 selected fraction
#
submitted.info[, RUNGAPS_EXCL:=NA_real_]
tmp <- submitted.info[, which(TEAM=='RUNGAPS_EXCLTAXA')]
set(submitted.info, tmp, 'RUNGAPS_EXCL', submitted.info[tmp, as.numeric(gsub('.*TRAIN[0-9][0-9][0-9]([0-9][0-9]).*','\\1',FILE))/100])
tmp <- submitted.info[, which(TEAM=='RUNGAPS_EXCLSITE')]
set(submitted.info, tmp, 'RUNGAPS_EXCL', submitted.info[tmp, as.numeric(gsub('EXCLSITES','',regmatches(FILE, regexpr('EXCLSITES[0-9]+',FILE))))/100])
tmp <- submitted.info[, which(TEAM=='RUNGAPS_ExaML')]
set(submitted.info, tmp, 'RUNGAPS_EXCL', 1)
stopifnot( !nrow(subset(submitted.info, is.na(RUNGAPS) & grepl('RUNGAPS',TEAM))) )
#
# define partial length
#
submitted.info[, PLEN:=NA_real_]
tmp <- submitted.info[, which(TEAM=='PLEN')]
set(submitted.info, tmp, 'PLEN', submitted.info[tmp, as.numeric(gsub('PL','',regmatches(FILE, regexpr('PL[0-9]+',FILE))))])
stopifnot( !nrow(subset(submitted.info, is.na(PLEN) & grepl('PLEN',TEAM))) )
#
# set covariates of scenarios
#
tmp <- data.table( SC= c("150701_REGIONAL_TRAIN1","150701_REGIONAL_TRAIN2","161121_REGIONAL_GTRFIXED1","161121_REGIONAL_GTRFIXED2","161121_REGIONAL_GTRFIXED3","150701_REGIONAL_TRAIN3","150701_REGIONAL_TRAIN4" ,"150701_REGIONAL_TRAIN5", "150701_VILL_SCENARIO-A", "150701_VILL_SCENARIO-B", "VILL_99_APR15","150701_VILL_SCENARIO-C", "150701_VILL_SCENARIO-D", "150701_VILL_SCENARIO-E","161121_REGIONAL_TRAIN6"),
MODEL= c('R','R','R','R','R','R','R','R','V','V','V','V','V','V','R'),
SEQCOV= c(0.16, 0.16, 0.16, 0.16, 0.16, 0.16, 0.16, 0.16, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6),
ACUTE= c('low', 'low', 'low', 'low', 'low', 'high', 'low', 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'low'),
GAPS= c('none', 'low', 'none', 'low', 'high', 'low', 'high', 'high', 'low', 'high', 'none', 'none', 'low', 'high', 'none'),
ART= c('none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'fast', 'fast', 'fast', 'fast','none'),
EXT= c('5pc', '5pc', '5pc', '5pc', '5pc', '5pc', '5pc', '5pc', '~0pc', '~0pc', '~0pc', '~0pc', '~0pc', '~0pc','5pc') )
submitted.info <- merge(submitted.info, tmp, by='SC')
#
# set which gene used to construct tree (either pol or concatenated gag+pol+env)
#
submitted.info[, GENE:=NA_character_]
set(submitted.info, submitted.info[, which(grepl('_FULL_', FILE))], 'GENE', 'GAG+POL+ENV')
set(submitted.info, submitted.info[, which(grepl('_GAG_', FILE))], 'GENE', 'GAG')
set(submitted.info, submitted.info[, which(grepl('_P17_', FILE))], 'GENE', 'P17')
set(submitted.info, submitted.info[, which(grepl('TRAIN1_PL|TRAIN6_PL', FILE))], 'GENE', 'GAG+POL+ENV')
set(submitted.info, submitted.info[, which(grepl('161121_Regional_TRAIN600', FILE))], 'GENE', 'GAG+POL+ENV')
stopifnot(nrow(subset(submitted.info, is.na(GENE)))==0)
#subset(submitted.info, TEAM=='GTRFIXED')
#
# best tree for each scenario
#
submitted.info[, BEST:='N']
#
# set OTHER (ie old or some preliminary/unknown tree)
#
submitted.info[, OTHER:='N']
#
# add BRL_UNITS
#
submitted.info[, BRL:='subst']
#
# number taxa in tree
#
setkey(submitted.info, IDX)
submitted.info[, TAXAN:= sapply(strs, Ntip)]
#
# add index of true tree
#
require(phangorn)
tmp <- subset(tfiles, select=c('SC','IDX_T','BRL_T'))
setkey(tmp, SC, BRL_T)
tmp <- unique(tmp)
tmp <- dcast.data.table(tmp, SC~BRL_T, value.var='IDX_T')
setnames(tmp, c('subst','time'), c("SUB_IDX_T","TIME_IDX_T"))
submitted.info <- merge(submitted.info, tmp, by='SC')
submitted.info <- merge(submitted.info, unique(subset(tfiles, select=c('SC','TAXAN_T'))), by='SC')
#stopifnot(nrow(subset(submitted.info, TAXAN>TAXAN_T))==0)
#
# fix taxa names that teams have changed
#
tmp <- subset(submitted.info, TEAM=='IQTree' & MODEL=='R')[, IDX]
for(i in tmp)
{
strs[[i]]$tip.label <- sapply(strsplit(strs[[i]]$tip.label,'_'), function(x) paste(x[1],'_',x[2],'|',x[3],'|',x[4],'_',x[5],'|',x[6],sep='') )
}
for(i in seq_along(strs))
{
strs[[i]]$tip.label <- toupper(strs[[i]]$tip.label)
}
for(i in seq_along(ttrs))
{
ttrs[[i]]$tip.label <- toupper(ttrs[[i]]$tip.label)
}
tmp2 <- subset(tinfo, BRL_T=='time', select=c(IDPOP,SC,TAXA))
setkey(tmp2, IDPOP,SC,TAXA)
tmp2 <- unique(tmp2)
tmp <- subset(submitted.info, TEAM=='PhyML' & MODEL=='R')[, IDX]
for(i in tmp)
{
cat(i,'\n')
z <- data.table(IDX=seq_along(strs[[i]]$tip.label), IDPOP=regmatches(strs[[i]]$tip.label, regexpr('IDPOP_[0-9]+',strs[[i]]$tip.label)), SC=subset(submitted.info, IDX==i)[,SC])
z <- merge(tmp2, z, by=c('IDPOP','SC'))
setkey(z, IDX)
stopifnot(nrow(z)==Ntip(strs[[i]]))
strs[[i]]$tip.label <- z[, TAXA]
}
tmp <- subset(submitted.info, TEAM=='PhyML' & MODEL=='V')[, IDX]
for(i in tmp)
{
cat(i,'\n')
z <- data.table(IDX=seq_along(strs[[i]]$tip.label), IDPOP=regmatches(strs[[i]]$tip.label, regexpr('HOUSE[0-9]+-[0-9]+|House[0-9]+-[0-9]+',strs[[i]]$tip.label)), SC=subset(submitted.info, IDX==i)[,SC])
z <- merge(tmp2, z, by=c('IDPOP','SC'))
stopifnot(nrow(z)==length(strs[[i]]$tip.label))
setkey(z, IDX)
strs[[i]]$tip.label <- z[, TAXA]
}
#
# check labels and remove labels that do not appear in the observed tree
# if additional labels are HXB2, root tree at HXB2
#
tmp <- submitted.info[, {
stree <- unroot(strs[[IDX]])
otree <- unroot(ttrs[[TIME_IDX_T]])
z <- setdiff(otree$tip.label, stree$tip.label)
list(CHECK= length(z)==abs(diff(c(TAXAN, TAXAN_T))) )
}, by='IDX']
tmp <- merge(subset(tmp, !CHECK), submitted.info, by='IDX')
for(i in seq_len(nrow(tmp)))
{
j <- tmp[i, IDX]
cat('\n',j)
otree <- tmp[i, TIME_IDX_T]
stree <- unroot(strs[[j]])
otree <- unroot(ttrs[[otree]])
z <- merge( data.table(TAXA=stree$tip.label, TYPE='s'), data.table(TAXA=otree$tip.label, TYPE='o'), by='TAXA', all=1)
z <- subset( z, is.na(TYPE.y))[, TAXA]
if(any(grepl('HXB2',z)))
{
zz <- z[grepl('HXB2',z)]
stree <- phytools:::reroot(stree, which(stree$tip.label==zz))
stree <- drop.tip(stree, zz)
z <- setdiff(zz,z)
}
if(length(z))
{
stree <- drop.tip(stree, z)
}
strs[[j]] <- stree
}
submitted.info[, TAXAN:= sapply(strs, Ntip)]
#
# are trees rooted?
#
setkey(submitted.info, IDX)
submitted.info[, ROOTED:=factor(sapply(strs, is.rooted),levels=c(TRUE,FALSE),labels=c('Y','N'))]
#
#outdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation'
#save(strs, ttrs, trungps, tinfo, tfiles, tbrl, submitted.info, file=file.path(outdir,'submitted_161123.rda'))
#
# read LSD trees
#
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/LSD_2'
infiles <- data.table(FILE=list.files(indir, pattern='LSD.date.newick$', full.names=TRUE))
infiles[, IDX:= as.integer(gsub('IDX_','',regmatches(basename(FILE),regexpr('IDX_[0-9]+',basename(FILE)))))]
setkey(infiles, IDX)
strs_lsd <- vector('list', submitted.info[, max(IDX)])
for(i in seq_len(nrow(infiles)))
{
# i<- 439
IDX <- infiles[i,IDX]
FILE <- infiles[i,FILE]
cat('\n',IDX)
ph <- read.tree(FILE)
stopifnot( !is.null(ph) )
stopifnot( identical(sort(strs[[IDX]]$tip.label), sort(ph$tip.label)) )
strs_lsd[[IDX]] <- ph
#names(strs_lsd[[IDX]]) <- FILE
}
setkey(submitted.info, IDX)
submitted.info[, WITH_LSD:= factor(sapply(strs_lsd, is.null), levels=c(TRUE,FALSE), labels=c('N','Y'))]
submitted.info <- subset(submitted.info, WITH_LSD=='Y')
#
# re-root simulated trees at root of LSD tree
#
strs_rtt <- vector('list', length(strs))
for(i in submitted.info[, IDX])
{
cat('\n',i)
ph <- strs[[i]]
phl <- strs_lsd[[i]]
# figure out taxon with shortest heigh in rooted lsd tree
tmp <- Ancestors(phl, seq_len(Ntip(phl)), type="all")
tmp2 <- sapply(tmp, length)
root.pivot <- which.min(tmp2)
root.pivot.name <- phl$tip.label[root.pivot]
# determine how many edges down from root.pivot the root is located
root.descend <- length(tmp[[root.pivot]])-1
# calculate 1 minus the proportion of the corresponding edge in ph to the root location
root.pos <- phl$edge.length[ which( phl$edge[,1]==rev(tmp[[root.pivot]])[1] ) ]
root.children <- phl$edge[which( phl$edge[,1]==rev(tmp[[root.pivot]])[1] ),2]
tmp <- sapply(root.children, function(x) x %in% c(tmp[[root.pivot]], root.pivot))
root.pos <- 1 - root.pos[tmp] / sum(root.pos)
# find the root child in ph
tmp <- which(ph$tip.label==root.pivot.name)
tmp2 <- Ancestors(ph, tmp, type="all")
root.node.child <- ifelse(root.descend==0, tmp, tmp2[root.descend])
# find the length of the branch to the root.child
# and get the bit at which the root is to be placed
root.pos <- root.pos * ph$edge.length[ which(ph$edge[,2]==root.node.child) ]
# reroot ph
ph <- reroot(ph, root.node.child, position=root.pos)
strs_rtt[[i]] <- ph
}
names(strs_rtt) <- names(strs)
#
# ladderize all trees
#
ttrs <- lapply(ttrs, ladderize)
strs <- lapply(strs, ladderize)
strs_rtt<- lapply(strs_rtt, function(ph){
if(!is.null(ph))
ph <- ladderize(ph)
ph
} )
#
# SAVE so far
#
outdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation'
save(strs, strs_lsd, strs_rtt, ttrs, trungps, tinfo, tfiles, tbrl, submitted.info, file=file.path(outdir,'submitted_161123.rda'))
}
treecomparison.submissions.170101<- function()
{
require(data.table)
require(ape)
require(adephylo)
require(phangorn)
require(parallel)
#
# get true trees
#
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations_trees'
tfiles <- list.files(indir, pattern='newick$', full.names=TRUE)
tfiles <- data.table( FILE_T= tfiles[ grepl('SUBSTTREE', tfiles) | grepl('Vill_99', tfiles) | grepl('.*DATEDTREE', tfiles) ] )
tfiles[, SC:= toupper(gsub('_SUBSTTREE|_DATEDTREE','',gsub('.newick','',basename(FILE_T))))]
tmp <- rbind( subset(tfiles, SC=='VILL_99_APR15'), subset(tfiles, SC=='VILL_99_APR15'), subset(tfiles, SC=='VILL_99_APR15') )
set(tmp, NULL, 'SC', c('150701_VILL_SCENARIO-C','150701_VILL_SCENARIO-D','150701_VILL_SCENARIO-E'))
tfiles <- rbind(tfiles, tmp)
#tmp <- list.files(indir, pattern='newick$', full.names=TRUE)
#tmp <- data.table( FILE_T= tmp[ grepl('*DATEDTREE', tmp) ] )
#tmp[, SC:= toupper(gsub('_SUBSTTREE|_DATEDTREE','',gsub('.newick','',basename(FILE_T))))]
#tfiles <- rbind(tfiles, tmp)
tfiles[, BRL_T:= 'time']
set(tfiles, NULL, 'SC', tfiles[, gsub('161121_GTR','161121_REGIONAL_GTR',SC)])
set(tfiles, tfiles[, which(grepl('REG',SC) & grepl('SUBST',FILE_T))], 'BRL_T', 'subst')
ttrs <- lapply(tfiles[, FILE_T], function(x) read.tree(file=x) )
names(ttrs) <- tfiles[, SC]
for(z in c('VILL_99_APR15','150701_VILL_SCENARIO-C','150701_VILL_SCENARIO-D','150701_VILL_SCENARIO-E'))
ttrs[[z]] <- root(ttrs[[z]], node=Ntip(ttrs[[z]])+2, resolve.root=1)
tfiles[, IDX_T:=seq_along(ttrs)]
tfiles[, TAXAN_T:= sapply(ttrs, Ntip)]
# patristic distances on true trees (by time and subst/site)
tbrl <- NULL
# info on true trees
tinfo <- merge(tfiles, do.call('rbind',lapply(seq_along(ttrs), function(i) data.table(TAXA=ttrs[[i]]$tip.label, IDX_T=i))), by='IDX_T')
tinfo[, IDPOP:=NA_character_]
tmp <- tinfo[, which(grepl('REGIONAL',SC))]
set(tinfo, tmp, 'IDPOP', tinfo[tmp,regmatches(TAXA, regexpr('IDPOP_[0-9]+',TAXA))])
tmp <- tinfo[, which(!grepl('REGIONAL',SC))]
set(tinfo, tmp, 'IDPOP', tinfo[tmp, regmatches(TAXA, regexpr('HOUSE[0-9]+-[0-9]+|House[0-9]+-[0-9]+',TAXA))])
stopifnot(subset(tinfo, grepl('VILL',SC))[, length(which(substring(TAXA,1,10)!=substring(IDPOP,1,10)))]==0)
stopifnot( tinfo[, length(which(is.na(IDPOP)))==0] )
set(tinfo, NULL, 'IDPOP', tinfo[,toupper(IDPOP)])
set(tinfo, NULL, 'TAXA', tinfo[,toupper(TAXA)])
# read cluster membership from DATEDCLUTREES
tmp <- list.files(indir, pattern='DATEDCLUTREES', full.names=TRUE)
tmp <- rbind( data.table( FILE_CLU_T= tmp,
SC= gsub('161121_GTR','161121_REGIONAL_GTR',toupper(gsub('_DATEDCLUTREES','',gsub('.newick','',basename(tmp))))),
BRL_T= 'time'),
data.table( FILE_CLU_T= tmp,
SC= gsub('161121_GTR','161121_REGIONAL_GTR',toupper(gsub('_DATEDCLUTREES','',gsub('.newick','',basename(tmp))))),
BRL_T= 'subst') )
tfiles <- merge(tfiles, tmp, by=c('SC','BRL_T'), all=1)
tmp <- subset(tfiles, !is.na(FILE_CLU_T))[, {
z <- read.tree(FILE_CLU_T)
do.call('rbind',lapply(seq_along(z), function(i) data.table(IDCLU=i, TAXA=z[[i]]$tip.label)))
}, by=c('SC','BRL_T')]
tinfo <- merge(tinfo, tmp, by=c('SC','BRL_T','TAXA'), all=1)
tmp <- subset(tinfo, !is.na(IDCLU))[, list(CLU_N= length(IDPOP)), by=c('SC','BRL_T','IDCLU')]
tinfo <- merge(tinfo, tmp, by=c('SC','BRL_T','IDCLU'), all=1)
# read sequences and determine %gappiness in full alignment
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/running_gaps_simulations2'
tmp <- list.files(indir, pattern='fa$|fasta$', full.names=TRUE)
trungps <- data.table( FILE_SEQ_T= tmp,
TEAM= 'RUNGAPS_EXCLTAXA',
SC='150701_Regional_TRAIN2',
GENE='FULL',
RUNGAPS= as.numeric(gsub('TRAIN[0-9]([0-9][0-9]).*','\\1',regmatches(tmp,regexpr('TRAIN[0-9]+',tmp))))/100,
RUNGAPS_EXCL= as.numeric(gsub('TRAIN[0-9][0-9][0-9]([0-9][0-9]).*','\\1',regmatches(tmp,regexpr('TRAIN[0-9]+',tmp))))/100
)
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/running_gaps_simulations3'
tmp <- list.files(indir, pattern='fa$|fasta$', full.names=TRUE)
tmp <- tmp[!grepl('WORST',tmp)]
tmp <- data.table( FILE_SEQ_T= tmp,
TEAM= 'RUNGAPS_EXCLSITE',
SC='150701_Regional_TRAIN2',
GENE=regmatches(tmp,regexpr('FULL|GAG|GAGPP|P17|FULL',tmp)),
RUNGAPS= as.numeric(gsub('TRAIN[0-9]([0-9][0-9]).*','\\1',regmatches(tmp,regexpr('TRAIN[0-9]+',tmp))))/100,
RUNGAPS_EXCL= as.numeric(gsub('EXCLSITES','',regmatches(tmp,regexpr('EXCLSITES[0-9]+',tmp))))/100
)
trungps <- rbind(trungps, tmp)
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/running_gaps_simulations'
tmp <- list.files(indir, pattern='fa$|fasta$', full.names=TRUE)
tmp <- data.table( FILE_SEQ_T= tmp,
TEAM= 'RUNGAPS_ExaML',
SC=gsub('(150701_Regional_TRAIN[0-9]).*','\\1',basename(tmp)),
GENE=regmatches(tmp,regexpr('FULL|GAG|GAGPP|P17|FULL',tmp)),
RUNGAPS= as.numeric(gsub('TRAIN[0-9]([0-9][0-9]).*','\\1',regmatches(tmp,regexpr('TRAIN[0-9]+',tmp))))/100,
RUNGAPS_EXCL= 1
)
trungps <- rbind(trungps, tmp)
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations'
tmp <- list.files(indir, full.names=1, pattern='_TRAIN[0-9]+_SIMULATED.fa$|GTRFIXED.*_SIMULATED.fasta$')
tmp <- data.table( FILE_SEQ_T= tmp,
TEAM= NA_real_,
SC=gsub('161121_','161121_REGIONAL_',toupper(gsub('_SIMULATED.fa|_SIMULATED.fasta','',basename(tmp)))),
GENE=gsub('\\.fa$','FULL',regmatches(tmp,regexpr('\\.fa$|FULL|GAG|GAGPP|P17|FULL',tmp))),
RUNGAPS= NA_real_,
RUNGAPS_EXCL= NA_real_
)
trungps <- rbind(trungps, tmp)
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/running_gaps_simulations4'
tmp <- list.files(indir, full.names=1, pattern='_TRAIN[0-9]+_FULL_SIMULATED.fa$')
tmp <- data.table( FILE_SEQ_T= tmp,
TEAM= 'RUNGAPS_ExaML',
SC=toupper(gsub('[0-9][0-9]_FULL_SIMULATED.fa|_SIMULATED.fasta','',basename(tmp))),
GENE=gsub('\\.fa$','FULL',regmatches(tmp,regexpr('\\.fa$|FULL|GAG|GAGPP|P17|FULL',tmp))),
RUNGAPS= as.numeric(gsub('TRAIN[0-9]([0-9][0-9]).*','\\1',regmatches(tmp,regexpr('TRAIN[0-9]+',tmp))))/100,
RUNGAPS_EXCL= 1
)
trungps <- rbind(trungps, tmp)
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/running_gaps_simulations5'
tmp <- list.files(indir, pattern='fa$|fasta$', full.names=TRUE)
tmp <- data.table( FILE_SEQ_T= tmp,
TEAM= 'RUNGAPS_ExaML',
SC=gsub('(161121_Regional_TRAIN[0-9]).*','\\1',basename(tmp)),
GENE=regmatches(tmp,regexpr('FULL|GAG|GAGPP|P17|FULL',tmp)),
RUNGAPS= as.numeric(gsub('TRAIN[0-9]([0-9][0-9]).*','\\1',regmatches(tmp,regexpr('TRAIN[0-9]+',tmp))))/100,
RUNGAPS_EXCL= 1
)
trungps <- rbind(trungps, tmp)
#
trungps <- trungps[, {
# FILE_SEQ_T<- '/Users/Oliver/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/running_gaps_simulations2/150701_Regional_TRAIN20250_FULL_SIMULATED.fa'
cat('\n',FILE_SEQ_T)
z <- read.dna(FILE_SEQ_T, format='fasta')
ans <- sapply(seq_len(nrow(z)), function(i) base.freq(z[i,], all=1))
list( TAXA=rownames(z),
ACTG_P=apply(ans[c('a','c','t','g'),], 2, sum),
UNASS_P=ans['?',],
NCOL=ncol(z))
}, by=c('FILE_SEQ_T','SC','TEAM','GENE','RUNGAPS','RUNGAPS_EXCL')]
trungps <- trungps[, list(ACTG_P=mean(ACTG_P), UNASS_P=mean(UNASS_P), SITES_N=NCOL[1]), by=c('FILE_SEQ_T','SC','TEAM','GENE','RUNGAPS','RUNGAPS_EXCL')]
#
# to tinfo add actual transmitters
#
# check TRAIN1
load( '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations_trees/150701_Regional_TRAIN1_SIMULATED_INTERNAL.R' )
ch <- subset(tinfo, SC=='150701_REGIONAL_TRAIN1' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(abs(DOB-DOB_CH)<=2*.Machine$double.eps)], ch[, all(GENDER==GENDER_CH)], ch[, all(abs(TIME_SEQ-TIME_SEQ_CH)<=0.001)] )
# OK :-) schedule adding IDPOP_T
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDREC,sep='')]]
tinfo.add <- subset(df.trms, select=c(IDPOP, IDTR))
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDTR,sep='')]]
tinfo.add <- merge(tinfo.add, subset(df.trms, select=c(IDPOP, IDREC)), by='IDPOP', all=1)
set(ch, NULL, 'IDPOP', ch[, paste('IDPOP_',IDPOP,sep='')])
tinfo.add <- merge(tinfo.add, subset(ch, select=IDPOP), by='IDPOP')
tinfo.add[, SC:='150701_REGIONAL_TRAIN1']
# check TRAIN2
load( '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations_trees/150701_Regional_TRAIN2_SIMULATED_INTERNAL.R' )
ch <- subset(tinfo, SC=='150701_REGIONAL_TRAIN2' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(abs(DOB-DOB_CH)<=2*.Machine$double.eps)], ch[, all(GENDER==GENDER_CH)], ch[, all(abs(TIME_SEQ-TIME_SEQ_CH)<=0.001)] )
# OK :-) schedule adding IDPOP_T
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDREC,sep='')]]
tmp <- subset(df.trms, select=c(IDPOP, IDTR))
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDTR,sep='')]]
tmp <- merge(tmp, subset(df.trms, select=c(IDPOP, IDREC)), by='IDPOP', all=1)
set(ch, NULL, 'IDPOP', ch[, paste('IDPOP_',IDPOP,sep='')])
tmp <- merge(tmp, subset(ch, select=IDPOP), by='IDPOP')
tmp[, SC:='150701_REGIONAL_TRAIN2']
tinfo.add <- rbind(tinfo.add, tmp)
# check TRAIN4
ch <- subset(tinfo, SC=='150701_REGIONAL_TRAIN4' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(abs(DOB-DOB_CH)<=2*.Machine$double.eps)], ch[, all(GENDER==GENDER_CH)], ch[, all(abs(TIME_SEQ-TIME_SEQ_CH)<=0.001)] )
tmp[, SC:='150701_REGIONAL_TRAIN4']
tinfo.add <- rbind(tinfo.add, tmp)
# check TRAIN3
load( '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations_trees/150701_Regional_TRAIN3_SIMULATED_INTERNAL.R' )
ch <- subset(tinfo, SC=='150701_REGIONAL_TRAIN3' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(abs(DOB-DOB_CH)<=2*.Machine$double.eps)], ch[, all(GENDER==GENDER_CH)], ch[, all(abs(TIME_SEQ-TIME_SEQ_CH)<=0.001)] )
# OK :-) schedule adding IDPOP_T
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDREC,sep='')]]
tmp <- subset(df.trms, select=c(IDPOP, IDTR))
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDTR,sep='')]]
tmp <- merge(tmp, subset(df.trms, select=c(IDPOP, IDREC)), by='IDPOP', all=1)
set(ch, NULL, 'IDPOP', ch[, paste('IDPOP_',IDPOP,sep='')])
tmp <- merge(tmp, subset(ch, select=IDPOP), by='IDPOP')
tmp[, SC:='150701_REGIONAL_TRAIN3']
tinfo.add <- rbind(tinfo.add, tmp)
# check TRAIN5
ch <- subset(tinfo, SC=='150701_REGIONAL_TRAIN5' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(abs(DOB-DOB_CH)<=2*.Machine$double.eps)], ch[, all(GENDER==GENDER_CH)], ch[, all(abs(TIME_SEQ-TIME_SEQ_CH)<=0.001)] )
tmp[, SC:='150701_REGIONAL_TRAIN5']
tinfo.add <- rbind(tinfo.add, tmp)
# check TRAIN6
load( '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations_trees/161121_Regional_TRAIN6_SIMULATED_INTERNAL.R' )
ch <- subset(tinfo, SC=='161121_REGIONAL_TRAIN6' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(abs(DOB-DOB_CH)<=2*.Machine$double.eps)], ch[, all(GENDER==GENDER_CH)], ch[, all(abs(TIME_SEQ-TIME_SEQ_CH)<=0.001)] )
# OK :-) schedule adding IDPOP_T
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDREC,sep='')]]
tmp <- subset(df.trms, select=c(IDPOP, IDTR))
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDTR,sep='')]]
tmp <- merge(tmp, subset(df.trms, select=c(IDPOP, IDREC)), by='IDPOP', all=1)
set(ch, NULL, 'IDPOP', ch[, paste('IDPOP_',IDPOP,sep='')])
tmp <- merge(tmp, subset(ch, select=IDPOP), by='IDPOP')
tmp[, SC:='161121_REGIONAL_TRAIN6']
tinfo.add <- rbind(tinfo.add, tmp)
# check TRAIN7
load( '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations_trees/161121_Regional_TRAIN7_SIMULATED_INTERNAL.R' )
ch <- subset(tinfo, SC=='161121_REGIONAL_TRAIN7' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(abs(DOB-DOB_CH)<=2*.Machine$double.eps)], ch[, all(GENDER==GENDER_CH)], ch[, all(abs(TIME_SEQ-TIME_SEQ_CH)<=0.001)] )
# OK :-) schedule adding IDPOP_T
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDREC,sep='')]]
tmp <- subset(df.trms, select=c(IDPOP, IDTR))
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDTR,sep='')]]
tmp <- merge(tmp, subset(df.trms, select=c(IDPOP, IDREC)), by='IDPOP', all=1)
set(ch, NULL, 'IDPOP', ch[, paste('IDPOP_',IDPOP,sep='')])
tmp <- merge(tmp, subset(ch, select=IDPOP), by='IDPOP')
tmp[, SC:='161121_REGIONAL_TRAIN7']
tinfo.add <- rbind(tinfo.add, tmp)
# check TRAIN8
load( '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations_trees/161121_Regional_TRAIN8_SIMULATED_INTERNAL.R' )
ch <- subset(tinfo, SC=='161121_REGIONAL_TRAIN8' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(abs(DOB-DOB_CH)<=2*.Machine$double.eps)], ch[, all(GENDER==GENDER_CH)], ch[, all(abs(TIME_SEQ-TIME_SEQ_CH)<=0.001)] )
# OK :-) schedule adding IDPOP_T
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDREC,sep='')]]
tmp <- subset(df.trms, select=c(IDPOP, IDTR))
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDTR,sep='')]]
tmp <- merge(tmp, subset(df.trms, select=c(IDPOP, IDREC)), by='IDPOP', all=1)
set(ch, NULL, 'IDPOP', ch[, paste('IDPOP_',IDPOP,sep='')])
tmp <- merge(tmp, subset(ch, select=IDPOP), by='IDPOP')
tmp[, SC:='161121_REGIONAL_TRAIN8']
tinfo.add <- rbind(tinfo.add, tmp)
# check GTRFIXED2
load( '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations_trees/161121_GTRFIXED2_SIMULATED_INTERNAL.R' )
ch <- subset(tinfo, SC=='161121_REGIONAL_GTRFIXED2' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(abs(DOB-DOB_CH)<=2*.Machine$double.eps)], ch[, all(GENDER==GENDER_CH)], ch[, all(abs(TIME_SEQ-TIME_SEQ_CH)<=0.001)] )
# OK :-) schedule adding IDPOP_T
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDREC,sep='')]]
tmp <- subset(df.trms, select=c(IDPOP, IDTR))
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDTR,sep='')]]
tmp <- merge(tmp, subset(df.trms, select=c(IDPOP, IDREC)), by='IDPOP', all=1)
set(ch, NULL, 'IDPOP', ch[, paste('IDPOP_',IDPOP,sep='')])
tmp <- merge(tmp, subset(ch, select=IDPOP), by='IDPOP')
tmp[, SC:='161121_REGIONAL_GTRFIXED2']
tinfo.add <- rbind(tinfo.add, tmp)
# check GTRFIXED3
load( '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations_trees/161121_GTRFIXED3_SIMULATED_INTERNAL.R' )
ch <- subset(tinfo, SC=='161121_REGIONAL_GTRFIXED3' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(abs(DOB-DOB_CH)<=2*.Machine$double.eps)], ch[, all(GENDER==GENDER_CH)], ch[, all(abs(TIME_SEQ-TIME_SEQ_CH)<=0.001)] )
# OK :-) schedule adding IDPOP_T
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDREC,sep='')]]
tmp <- subset(df.trms, select=c(IDPOP, IDTR))
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDTR,sep='')]]
tmp <- merge(tmp, subset(df.trms, select=c(IDPOP, IDREC)), by='IDPOP', all=1)
set(ch, NULL, 'IDPOP', ch[, paste('IDPOP_',IDPOP,sep='')])
tmp <- merge(tmp, subset(ch, select=IDPOP), by='IDPOP')
tmp[, SC:='161121_REGIONAL_GTRFIXED3']
tinfo.add <- rbind(tinfo.add, tmp)
# check GTRFIXED1
load( '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations_trees/161121_GTRFIXED1_SIMULATED_INTERNAL.R' )
ch <- subset(tinfo, SC=='161121_REGIONAL_GTRFIXED1' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(abs(DOB-DOB_CH)<=2*.Machine$double.eps)], ch[, all(GENDER==GENDER_CH)], ch[, all(abs(TIME_SEQ-TIME_SEQ_CH)<=0.001)] )
# OK :-) schedule adding IDPOP_T
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDREC,sep='')]]
tmp <- subset(df.trms, select=c(IDPOP, IDTR))
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDTR,sep='')]]
tmp <- merge(tmp, subset(df.trms, select=c(IDPOP, IDREC)), by='IDPOP', all=1)
set(ch, NULL, 'IDPOP', ch[, paste('IDPOP_',IDPOP,sep='')])
tmp <- merge(tmp, subset(ch, select=IDPOP), by='IDPOP')
tmp[, SC:='161121_REGIONAL_GTRFIXED1']
tinfo.add <- rbind(tinfo.add, tmp)
#
# add transmitters for regional to tinfo
#
tinfo <- merge(tinfo, tinfo.add, by=c('IDPOP', 'SC'), all.x=1)
#
# add true node depths to tinfo
#
tmp <- tinfo[, {
cat(IDX_T,'\n')
ph<- ttrs[[IDX_T]]
list(DEPTH_T=node.depth.edgelength(ph)[seq_len(Ntip(ph))], TAXA=ph$tip.label)
}, by='IDX_T']
tinfo <- merge(tinfo, tmp, by=c('IDX_T','TAXA'), all.x=1)
#
# compute closest individual on true trees
#
tmp <- unique(subset(tinfo, select=c(SC, BRL_T, IDX_T)))
tmp <- tmp[, {
print(IDX_T)
ph <- ttrs[[IDX_T]]
model.reg <- grepl('REGIONAL',SC)
treedist.closest.ind(ph, model.reg)
}, by=c('SC','BRL_T','IDX_T')]
tinfo <- merge(tinfo, tmp, by=c('SC','BRL_T','IDX_T','IDPOP'))
set(tinfo, NULL, 'IDPOP_CL', tinfo[, gsub('IDPOP_','',IDPOP_CL)])
#
# add if transmitter sampled
#
tmp <- subset(tinfo, grepl('REGIONAL',SC))
set(tmp, NULL, 'IDPOP', tmp[,as.integer(gsub('IDPOP_','',IDPOP))])
tmp <- unique(tmp, by=c('IDX_T','IDPOP'))[, {
z <- IDX_T
list(IDTR_SAMPLED=ifelse(IDTR%in%subset(tmp, IDX_T==z)[['IDPOP']], 'Y', 'N'))
}, by=c('IDX_T','IDPOP')]
set(tmp, NULL, 'IDPOP', tmp[, paste('IDPOP_',IDPOP,sep='')])
tinfo <- merge(tinfo, tmp, by=c('IDX_T','IDPOP'),all.x=1)
#
# get submitted trees
#
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/running_gaps5'
infiles <- list.files(indir, pattern='newick$', recursive=1, full.names=1)
infiles <- data.table(FILE=infiles)
strs <- lapply(infiles[, FILE], function(x)
{
cat('\n',x)
read.tree(file=x)
})
names(strs) <- infiles[, FILE]
#
#
#
submitted.info <- data.table(FILE=names(strs))
submitted.info[, IDX:=seq_along(strs)]
#
# set team
#
submitted.info[, TEAM:=NA_character_]
set(submitted.info, submitted.info[, which(grepl('running_gaps5',FILE) & !grepl('_PL[0-9]+_',FILE))], 'TEAM', 'RUNGAPS_ExaML')
set(submitted.info, submitted.info[, which(grepl('running_gaps5',FILE) & grepl('_PL[0-9]+_',FILE))], 'TEAM', 'PLEN')
stopifnot( submitted.info[, length(which(is.na(TEAM)))==0] )
#
# scenario
#
submitted.info[, SC:=NA_character_]
tmp <- submitted.info[, which(grepl('161125_Regional_TRAIN7|161121_Regional_TRAIN7', FILE))]
set(submitted.info, tmp, 'SC', '161121_REGIONAL_TRAIN7')
tmp <- submitted.info[, which(grepl('161125_Regional_TRAIN8|161121_Regional_TRAIN8', FILE))]
set(submitted.info, tmp, 'SC', '161121_REGIONAL_TRAIN8')
set(submitted.info, NULL, 'SC', submitted.info[, toupper(SC)])
stopifnot( submitted.info[, length(which(is.na(SC)))==0] )
#
# define running gaps for the running gaps analyses
#
submitted.info[, RUNGAPS:=NA_real_]
tmp <- submitted.info[, which(grepl('RUNGAPS',TEAM))]
set(submitted.info, tmp, 'RUNGAPS', submitted.info[tmp, as.numeric(gsub('.*TRAIN[0-9]([0-9][0-9]).*','\\1',regmatches(FILE,regexpr('TRAIN[0-9]+',FILE))))/100])
tmp <- submitted.info[, which('PLEN'==TEAM)]
set(submitted.info, tmp, 'RUNGAPS', 0)
stopifnot( !nrow(subset(submitted.info, is.na(RUNGAPS) & grepl('RUNGAPS',TEAM))) )
#
# define running gaps2 selected fraction
#
submitted.info[, RUNGAPS_EXCL:=NA_real_]
tmp <- submitted.info[, which(TEAM=='RUNGAPS_ExaML')]
set(submitted.info, tmp, 'RUNGAPS_EXCL', 1)
stopifnot( !nrow(subset(submitted.info, is.na(RUNGAPS) & grepl('RUNGAPS',TEAM))) )
#
# define partial length
#
submitted.info[, PLEN:=NA_real_]
tmp <- submitted.info[, which(TEAM=='PLEN')]
set(submitted.info, tmp, 'PLEN', submitted.info[tmp, as.numeric(gsub('PL','',regmatches(FILE, regexpr('PL[0-9]+',FILE))))])
stopifnot( !nrow(subset(submitted.info, is.na(PLEN) & grepl('PLEN',TEAM))) )
#
# set covariates of scenarios
#
tmp <- data.table( SC= c("150701_REGIONAL_TRAIN1","150701_REGIONAL_TRAIN2","161121_REGIONAL_GTRFIXED1","161121_REGIONAL_GTRFIXED2","161121_REGIONAL_GTRFIXED3","150701_REGIONAL_TRAIN3","150701_REGIONAL_TRAIN4" ,"150701_REGIONAL_TRAIN5", "150701_VILL_SCENARIO-A", "150701_VILL_SCENARIO-B", "VILL_99_APR15","150701_VILL_SCENARIO-C", "150701_VILL_SCENARIO-D", "150701_VILL_SCENARIO-E","161121_REGIONAL_TRAIN6","161121_REGIONAL_TRAIN7","161121_REGIONAL_TRAIN8"),
MODEL= c('R','R','R','R','R','R','R','R','V','V','V','V','V','V','R','R','R'),
SEQCOV= c(0.16, 0.16, 0.16, 0.16, 0.16, 0.16, 0.16, 0.16, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.31, 0.15),
ACUTE= c('low', 'low', 'low', 'low', 'low', 'high', 'low', 'high', 'high', 'high', 'high', 'high', 'high', 'high','low','low','low'),
GAPS= c('none', 'low', 'none', 'low', 'high', 'low', 'high', 'high', 'low', 'high', 'none', 'none', 'low', 'high','none','none','none'),
ART= c('none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'fast', 'fast', 'fast', 'fast','none','none','none'),
EXT= c('5pc', '5pc', '5pc', '5pc', '5pc', '5pc', '5pc', '5pc', '~0pc', '~0pc', '~0pc', '~0pc', '~0pc', '~0pc','5pc','5pc','5pc') )
submitted.info <- merge(submitted.info, tmp, by='SC')
#
# set which gene used to construct tree (either pol or concatenated gag+pol+env)
#
submitted.info[, GENE:=NA_character_]
set(submitted.info, submitted.info[, which(grepl('_FULL_', FILE))], 'GENE', 'GAG+POL+ENV')
set(submitted.info, submitted.info[, which(grepl('_GAG_', FILE))], 'GENE', 'GAG')
set(submitted.info, submitted.info[, which(grepl('_P17_', FILE))], 'GENE', 'P17')
set(submitted.info, submitted.info[, which(grepl('TRAIN[0-9]_PL', FILE))], 'GENE', 'GAG+POL+ENV')
set(submitted.info, submitted.info[, which(grepl('161121_Regional_TRAIN600', FILE))], 'GENE', 'GAG+POL+ENV')
stopifnot(nrow(subset(submitted.info, is.na(GENE)))==0)
#subset(submitted.info, TEAM=='GTRFIXED')
#
# best tree for each scenario
#
submitted.info[, BEST:='N']
#
# set OTHER (ie old or some preliminary/unknown tree)
#
submitted.info[, OTHER:='N']
#
# add BRL_UNITS
#
submitted.info[, BRL:='subst']
#
# number taxa in tree
#
setkey(submitted.info, IDX)
submitted.info[, TAXAN:= sapply(strs, Ntip)]
#
# add index of true tree
#
require(phangorn)
tmp <- subset(tfiles, select=c('SC','IDX_T','BRL_T'))
tmp <- unique(tmp, by=c('SC','BRL_T'))
tmp <- dcast.data.table(tmp, SC~BRL_T, value.var='IDX_T')
setnames(tmp, c('subst','time'), c("SUB_IDX_T","TIME_IDX_T"))
submitted.info <- merge(submitted.info, tmp, by='SC')
submitted.info <- merge(submitted.info, unique(subset(tfiles, select=c('SC','TAXAN_T'))), by='SC')
#stopifnot(nrow(subset(submitted.info, TAXAN>TAXAN_T))==0)
#
# fix taxa names that teams have changed
#
tmp <- subset(submitted.info, TEAM=='IQTree' & MODEL=='R')[, IDX]
for(i in tmp)
{
strs[[i]]$tip.label <- sapply(strsplit(strs[[i]]$tip.label,'_'), function(x) paste(x[1],'_',x[2],'|',x[3],'|',x[4],'_',x[5],'|',x[6],sep='') )
}
for(i in seq_along(strs))
{
strs[[i]]$tip.label <- toupper(strs[[i]]$tip.label)
}
for(i in seq_along(ttrs))
{
ttrs[[i]]$tip.label <- toupper(ttrs[[i]]$tip.label)
}
tmp2 <- subset(tinfo, BRL_T=='time', select=c(IDPOP,SC,TAXA))
setkey(tmp2, IDPOP,SC,TAXA)
tmp2 <- unique(tmp2)
tmp <- subset(submitted.info, TEAM=='PhyML' & MODEL=='R')[, IDX]
for(i in tmp)
{
cat(i,'\n')
z <- data.table(IDX=seq_along(strs[[i]]$tip.label), IDPOP=regmatches(strs[[i]]$tip.label, regexpr('IDPOP_[0-9]+',strs[[i]]$tip.label)), SC=subset(submitted.info, IDX==i)[,SC])
z <- merge(tmp2, z, by=c('IDPOP','SC'))
setkey(z, IDX)
stopifnot(nrow(z)==Ntip(strs[[i]]))
strs[[i]]$tip.label <- z[, TAXA]
}
tmp <- subset(submitted.info, TEAM=='PhyML' & MODEL=='V')[, IDX]
for(i in tmp)
{
cat(i,'\n')
z <- data.table(IDX=seq_along(strs[[i]]$tip.label), IDPOP=regmatches(strs[[i]]$tip.label, regexpr('HOUSE[0-9]+-[0-9]+|House[0-9]+-[0-9]+',strs[[i]]$tip.label)), SC=subset(submitted.info, IDX==i)[,SC])
z <- merge(tmp2, z, by=c('IDPOP','SC'))
stopifnot(nrow(z)==length(strs[[i]]$tip.label))
setkey(z, IDX)
strs[[i]]$tip.label <- z[, TAXA]
}
#
# check labels and remove labels that do not appear in the observed tree
# if additional labels are HXB2, root tree at HXB2
#
tmp <- submitted.info[, {
stree <- unroot(strs[[IDX]])
otree <- unroot(ttrs[[TIME_IDX_T]])
z <- setdiff(otree$tip.label, stree$tip.label)
list(CHECK= length(z)==abs(diff(c(TAXAN, TAXAN_T))) )
}, by='IDX']
tmp <- merge(subset(tmp, !CHECK), submitted.info, by='IDX')
for(i in seq_len(nrow(tmp)))
{
j <- tmp[i, IDX]
cat('\n',j)
otree <- tmp[i, TIME_IDX_T]
stree <- unroot(strs[[j]])
otree <- unroot(ttrs[[otree]])
z <- merge( data.table(TAXA=stree$tip.label, TYPE='s'), data.table(TAXA=otree$tip.label, TYPE='o'), by='TAXA', all=1)
z <- subset( z, is.na(TYPE.y))[, TAXA]
if(any(grepl('HXB2',z)))
{
zz <- z[grepl('HXB2',z)]
stree <- phytools:::reroot(stree, which(stree$tip.label==zz))
stree <- drop.tip(stree, zz)
z <- setdiff(zz,z)
}
if(length(z))
{
stree <- drop.tip(stree, z)
}
strs[[j]] <- stree
}
submitted.info[, TAXAN:= sapply(strs, Ntip)]
#
# are trees rooted?
#
setkey(submitted.info, IDX)
submitted.info[, ROOTED:=factor(sapply(strs, is.rooted),levels=c(TRUE,FALSE),labels=c('Y','N'))]
#
#outdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation'
#save(strs, ttrs, trungps, tinfo, tfiles, tbrl, submitted.info, file=file.path(outdir,'submitted_170101.rda'))
#
# read LSD trees
#
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/LSD_3'
infiles <- data.table(FILE=list.files(indir, pattern='LSD.date.newick$', full.names=TRUE))
infiles[, IDX:= as.integer(gsub('IDX_','',regmatches(basename(FILE),regexpr('IDX_[0-9]+',basename(FILE)))))]
setkey(infiles, IDX)
strs_lsd <- vector('list', submitted.info[, max(IDX)])
for(i in seq_len(nrow(infiles)))
{
# i<- 439
IDX <- infiles[i,IDX]
FILE <- infiles[i,FILE]
cat('\n',IDX)
ph <- readLines(FILE)
if(grepl('^\\(\\(\\):0,', ph) & grepl(':-?[0-9].?[0-9]*[eE]*-?[0-9]*\\);$', ph))
{
ph <- gsub(':-?[0-9].?[0-9]*[eE]*-?[0-9]*\\);$',';',gsub('^\\(\\(\\):0,','',ph))
}
ph <- read.tree(text=ph)
stopifnot( !is.null(ph) )
stopifnot( identical(sort(strs[[IDX]]$tip.label), sort(ph$tip.label)) )
strs_lsd[[IDX]] <- ph
#names(strs_lsd[[IDX]]) <- FILE
}
setkey(submitted.info, IDX)
submitted.info[, WITH_LSD:= factor(sapply(strs_lsd, is.null), levels=c(TRUE,FALSE), labels=c('N','Y'))]
submitted.info <- subset(submitted.info, WITH_LSD=='Y')
#
# ladderize all trees
#
ttrs <- lapply(ttrs, ladderize)
strs <- lapply(strs, ladderize)
strs_lsd<- lapply(strs_lsd, function(ph){
if(!is.null(ph))
ph <- ladderize(ph)
ph
} )
#
# SAVE so far
#
outdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation'
save(strs, strs_lsd, ttrs, trungps, tinfo, tfiles, tbrl, submitted.info, file=file.path(outdir,'submitted_170101.rda'))
}
treecomparison.submissions.170424<- function()
{
require(data.table)
require(ape)
require(adephylo)
require(phangorn)
require(parallel)
#
# get true trees
#
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations_trees'
tfiles <- list.files(indir, pattern='newick$', full.names=TRUE)
tfiles <- data.table( FILE_T= tfiles[ grepl('SUBSTTREE', tfiles) | grepl('Vill_99', tfiles) | grepl('.*DATEDTREE', tfiles) ] )
tfiles <- subset( tfiles, grepl('TRAIN1|TRAIN2|TRAIN4',FILE_T) )
tfiles[, SC:= toupper(gsub('_SUBSTTREE|_DATEDTREE','',gsub('.newick','',basename(FILE_T))))]
tmp <- rbind( subset(tfiles, SC=='VILL_99_APR15'), subset(tfiles, SC=='VILL_99_APR15'), subset(tfiles, SC=='VILL_99_APR15') )
set(tmp, NULL, 'SC', c('150701_VILL_SCENARIO-C','150701_VILL_SCENARIO-D','150701_VILL_SCENARIO-E'))
tfiles <- rbind(tfiles, tmp)
#tmp <- list.files(indir, pattern='newick$', full.names=TRUE)
#tmp <- data.table( FILE_T= tmp[ grepl('*DATEDTREE', tmp) ] )
#tmp[, SC:= toupper(gsub('_SUBSTTREE|_DATEDTREE','',gsub('.newick','',basename(FILE_T))))]
#tfiles <- rbind(tfiles, tmp)
tfiles[, BRL_T:= 'time']
set(tfiles, NULL, 'SC', tfiles[, gsub('161121_GTR','161121_REGIONAL_GTR',SC)])
set(tfiles, tfiles[, which(grepl('REG',SC) & grepl('SUBST',FILE_T))], 'BRL_T', 'subst')
ttrs <- lapply(tfiles[, FILE_T], function(x) read.tree(file=x) )
names(ttrs) <- tfiles[, SC]
tfiles[, IDX_T:=seq_along(ttrs)]
tfiles[, TAXAN_T:= sapply(ttrs, Ntip)]
# patristic distances on true trees (by time and subst/site)
tbrl <- NULL
# info on true trees
tinfo <- merge(tfiles, do.call('rbind',lapply(seq_along(ttrs), function(i) data.table(TAXA=ttrs[[i]]$tip.label, IDX_T=i))), by='IDX_T')
tinfo[, IDPOP:=NA_character_]
tmp <- tinfo[, which(grepl('REGIONAL',SC))]
set(tinfo, tmp, 'IDPOP', tinfo[tmp,regmatches(TAXA, regexpr('IDPOP_[0-9]+',TAXA))])
tmp <- tinfo[, which(!grepl('REGIONAL',SC))]
set(tinfo, tmp, 'IDPOP', tinfo[tmp, regmatches(TAXA, regexpr('HOUSE[0-9]+-[0-9]+|House[0-9]+-[0-9]+',TAXA))])
stopifnot(subset(tinfo, grepl('VILL',SC))[, length(which(substring(TAXA,1,10)!=substring(IDPOP,1,10)))]==0)
stopifnot( tinfo[, length(which(is.na(IDPOP)))==0] )
set(tinfo, NULL, 'IDPOP', tinfo[,toupper(IDPOP)])
set(tinfo, NULL, 'TAXA', tinfo[,toupper(TAXA)])
# read cluster membership from DATEDCLUTREES
tmp <- list.files(indir, pattern='DATEDCLUTREES', full.names=TRUE)
tmp <- tmp[grepl('TRAIN1|TRAIN2|TRAIN4',tmp)]
tmp <- rbind( data.table( FILE_CLU_T= tmp,
SC= gsub('161121_GTR','161121_REGIONAL_GTR',toupper(gsub('_DATEDCLUTREES','',gsub('.newick','',basename(tmp))))),
BRL_T= 'time'),
data.table( FILE_CLU_T= tmp,
SC= gsub('161121_GTR','161121_REGIONAL_GTR',toupper(gsub('_DATEDCLUTREES','',gsub('.newick','',basename(tmp))))),
BRL_T= 'subst') )
tfiles <- merge(tfiles, tmp, by=c('SC','BRL_T'), all=1)
tmp <- subset(tfiles, !is.na(FILE_CLU_T))[, {
z <- read.tree(FILE_CLU_T)
do.call('rbind',lapply(seq_along(z), function(i) data.table(IDCLU=i, TAXA=z[[i]]$tip.label)))
}, by=c('SC','BRL_T')]
tinfo <- merge(tinfo, tmp, by=c('SC','BRL_T','TAXA'), all=1)
tmp <- subset(tinfo, !is.na(IDCLU))[, list(CLU_N= length(IDPOP)), by=c('SC','BRL_T','IDCLU')]
tinfo <- merge(tinfo, tmp, by=c('SC','BRL_T','IDCLU'), all=1)
# read sequences and determine %gappiness in full alignment
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations'
tmp <- list.files(indir, full.names=1, pattern='_TRAIN[0-9]+_SIMULATED.fa$|GTRFIXED.*_SIMULATED.fasta$')
tmp <- tmp[grepl('TRAIN1|TRAIN2|TRAIN4',tmp)]
trungps <- data.table( FILE_SEQ_T= tmp,
TEAM= NA_real_,
SC=gsub('161121_','161121_REGIONAL_',toupper(gsub('_SIMULATED.fa|_SIMULATED.fasta','',basename(tmp)))),
GENE=gsub('\\.fa$','FULL',regmatches(tmp,regexpr('\\.fa$|FULL|GAG|GAGPP|P17|FULL',tmp))),
RUNGAPS= NA_real_,
RUNGAPS_EXCL= NA_real_
)
#
trungps <- trungps[, {
# FILE_SEQ_T<- '/Users/Oliver/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/running_gaps_simulations2/150701_Regional_TRAIN20250_FULL_SIMULATED.fa'
cat('\n',FILE_SEQ_T)
z <- read.dna(FILE_SEQ_T, format='fasta')
ans <- sapply(seq_len(nrow(z)), function(i) base.freq(z[i,], all=1))
list( TAXA=rownames(z),
ACTG_P=apply(ans[c('a','c','t','g'),], 2, sum),
UNASS_P=ans['?',],
NCOL=ncol(z))
}, by=c('FILE_SEQ_T','SC','TEAM','GENE','RUNGAPS','RUNGAPS_EXCL')]
trungps <- trungps[, list(ACTG_P=mean(ACTG_P), UNASS_P=mean(UNASS_P), SITES_N=NCOL[1]), by=c('FILE_SEQ_T','SC','TEAM','GENE','RUNGAPS','RUNGAPS_EXCL')]
#
# to tinfo add actual transmitters
#
# check TRAIN1
load( '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations_trees/150701_Regional_TRAIN1_SIMULATED_INTERNAL.R' )
ch <- subset(tinfo, SC=='150701_REGIONAL_TRAIN1' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(abs(DOB-DOB_CH)<=2*.Machine$double.eps)], ch[, all(GENDER==GENDER_CH)], ch[, all(abs(TIME_SEQ-TIME_SEQ_CH)<=0.001)] )
# OK :-) schedule adding IDPOP_T
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDREC,sep='')]]
tinfo.add <- subset(df.trms, select=c(IDPOP, IDTR))
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDTR,sep='')]]
tinfo.add <- merge(tinfo.add, subset(df.trms, select=c(IDPOP, IDREC)), by='IDPOP', all=1)
set(ch, NULL, 'IDPOP', ch[, paste('IDPOP_',IDPOP,sep='')])
tinfo.add <- merge(tinfo.add, subset(ch, select=IDPOP), by='IDPOP')
tinfo.add[, SC:='150701_REGIONAL_TRAIN1']
# check TRAIN2
load( '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations_trees/150701_Regional_TRAIN2_SIMULATED_INTERNAL.R' )
ch <- subset(tinfo, SC=='150701_REGIONAL_TRAIN2' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(abs(DOB-DOB_CH)<=2*.Machine$double.eps)], ch[, all(GENDER==GENDER_CH)], ch[, all(abs(TIME_SEQ-TIME_SEQ_CH)<=0.001)] )
# OK :-) schedule adding IDPOP_T
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDREC,sep='')]]
tmp <- subset(df.trms, select=c(IDPOP, IDTR))
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDTR,sep='')]]
tmp <- merge(tmp, subset(df.trms, select=c(IDPOP, IDREC)), by='IDPOP', all=1)
set(ch, NULL, 'IDPOP', ch[, paste('IDPOP_',IDPOP,sep='')])
tmp <- merge(tmp, subset(ch, select=IDPOP), by='IDPOP')
tmp[, SC:='150701_REGIONAL_TRAIN2']
tinfo.add <- rbind(tinfo.add, tmp)
# check TRAIN4
ch <- subset(tinfo, SC=='150701_REGIONAL_TRAIN4' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(abs(DOB-DOB_CH)<=2*.Machine$double.eps)], ch[, all(GENDER==GENDER_CH)], ch[, all(abs(TIME_SEQ-TIME_SEQ_CH)<=0.001)] )
tmp[, SC:='150701_REGIONAL_TRAIN4']
tinfo.add <- rbind(tinfo.add, tmp)
#
# add transmitters for regional to tinfo
#
tinfo <- merge(tinfo, tinfo.add, by=c('IDPOP', 'SC'), all.x=1)
#
# add true node depths to tinfo
#
tmp <- tinfo[, {
cat(IDX_T,'\n')
ph<- ttrs[[IDX_T]]
list(DEPTH_T=node.depth.edgelength(ph)[seq_len(Ntip(ph))], TAXA=ph$tip.label)
}, by='IDX_T']
tinfo <- merge(tinfo, tmp, by=c('IDX_T','TAXA'), all.x=1)
#
# compute closest individual on true trees
#
tmp <- unique(subset(tinfo, select=c(SC, BRL_T, IDX_T)))
tmp <- tmp[, {
print(IDX_T)
ph <- ttrs[[IDX_T]]
model.reg <- grepl('REGIONAL',SC)
treedist.closest.ind(ph, model.reg)
}, by=c('SC','BRL_T','IDX_T')]
tinfo <- merge(tinfo, tmp, by=c('SC','BRL_T','IDX_T','IDPOP'))
set(tinfo, NULL, 'IDPOP_CL', tinfo[, gsub('IDPOP_','',IDPOP_CL)])
#
# add if transmitter sampled
#
tmp <- subset(tinfo, grepl('REGIONAL',SC))
set(tmp, NULL, 'IDPOP', tmp[,as.integer(gsub('IDPOP_','',IDPOP))])
tmp <- unique(tmp, by=c('IDX_T','IDPOP'))[, {
z <- IDX_T
list(IDTR_SAMPLED=ifelse(IDTR%in%subset(tmp, IDX_T==z)[['IDPOP']], 'Y', 'N'))
}, by=c('IDX_T','IDPOP')]
set(tmp, NULL, 'IDPOP', tmp[, paste('IDPOP_',IDPOP,sep='')])
tinfo <- merge(tinfo, tmp, by=c('IDX_T','IDPOP'),all.x=1)
#
# get submitted trees
#
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/FastTree'
infiles <- list.files(indir, pattern='newick$', recursive=1, full.names=1)
infiles <- data.table(FILE=infiles)
strs <- lapply(infiles[, FILE], function(x)
{
cat('\n',x)
read.tree(file=x)
})
names(strs) <- infiles[, FILE]
#
#
#
submitted.info <- data.table(FILE=names(strs))
submitted.info[, IDX:=seq_along(strs)]
#
# set team
#
submitted.info[, TEAM:=NA_character_]
set(submitted.info, submitted.info[, which(grepl('fasttreedefault',FILE))], 'TEAM', 'FT_DEFAULT')
set(submitted.info, submitted.info[, which(grepl('fasttreepseudo',FILE))], 'TEAM', 'FT_PSEUDO')
set(submitted.info, submitted.info[, which(grepl('fasttreeslow\\.',FILE))], 'TEAM', 'FT_SLOW')
set(submitted.info, submitted.info[, which(grepl('fasttreeslowpseudo',FILE))], 'TEAM', 'FT_SLOWPSEUDO')
stopifnot( submitted.info[, length(which(is.na(TEAM)))==0] )
#
# scenario
#
submitted.info[, SC:=NA_character_]
set(submitted.info, submitted.info[, which(grepl('TRAIN1', FILE))], 'SC', '150701_REGIONAL_TRAIN1')
set(submitted.info, submitted.info[, which(grepl('TRAIN2', FILE))], 'SC', '150701_REGIONAL_TRAIN2')
set(submitted.info, submitted.info[, which(grepl('TRAIN4', FILE))], 'SC', '150701_REGIONAL_TRAIN4')
stopifnot( submitted.info[, length(which(is.na(SC)))==0] )
#
# define running gaps for the running gaps analyses
# define running gaps2 selected fraction
# define partial length
submitted.info[, RUNGAPS:=NA_real_]
submitted.info[, RUNGAPS_EXCL:=NA_real_]
submitted.info[, PLEN:=NA_real_]
#
# set covariates of scenarios
#
tmp <- data.table( SC= c("150701_REGIONAL_TRAIN1","150701_REGIONAL_TRAIN2","161121_REGIONAL_GTRFIXED1","161121_REGIONAL_GTRFIXED2","161121_REGIONAL_GTRFIXED3","150701_REGIONAL_TRAIN3","150701_REGIONAL_TRAIN4" ,"150701_REGIONAL_TRAIN5", "150701_VILL_SCENARIO-A", "150701_VILL_SCENARIO-B", "VILL_99_APR15","150701_VILL_SCENARIO-C", "150701_VILL_SCENARIO-D", "150701_VILL_SCENARIO-E","161121_REGIONAL_TRAIN6","161121_REGIONAL_TRAIN7","161121_REGIONAL_TRAIN8"),
MODEL= c('R','R','R','R','R','R','R','R','V','V','V','V','V','V','R','R','R'),
SEQCOV= c(0.16, 0.16, 0.16, 0.16, 0.16, 0.16, 0.16, 0.16, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.31, 0.15),
ACUTE= c('low', 'low', 'low', 'low', 'low', 'high', 'low', 'high', 'high', 'high', 'high', 'high', 'high', 'high','low','low','low'),
GAPS= c('none', 'low', 'none', 'low', 'high', 'low', 'high', 'high', 'low', 'high', 'none', 'none', 'low', 'high','none','none','none'),
ART= c('none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'fast', 'fast', 'fast', 'fast','none','none','none'),
EXT= c('5pc', '5pc', '5pc', '5pc', '5pc', '5pc', '5pc', '5pc', '~0pc', '~0pc', '~0pc', '~0pc', '~0pc', '~0pc','5pc','5pc','5pc') )
submitted.info <- merge(submitted.info, tmp, by='SC')
#
# set which gene used to construct tree (either pol or concatenated gag+pol+env)
#
submitted.info[, GENE:='GAG+POL+ENV']
#
# best tree for each scenario
#
submitted.info[, BEST:='N']
#
# set OTHER (ie old or some preliminary/unknown tree)
#
submitted.info[, OTHER:='N']
#
# add BRL_UNITS
#
submitted.info[, BRL:='subst']
#
# number taxa in tree
#
setkey(submitted.info, IDX)
submitted.info[, TAXAN:= sapply(strs, Ntip)]
#
# add index of true tree
#
require(phangorn)
tmp <- subset(tfiles, select=c('SC','IDX_T','BRL_T'))
tmp <- unique(tmp, by=c('SC','BRL_T'))
tmp <- dcast.data.table(tmp, SC~BRL_T, value.var='IDX_T')
setnames(tmp, c('subst','time'), c("SUB_IDX_T","TIME_IDX_T"))
submitted.info <- merge(submitted.info, tmp, by='SC')
submitted.info <- merge(submitted.info, unique(subset(tfiles, select=c('SC','TAXAN_T'))), by='SC')
#
# check labels and remove labels that do not appear in the observed tree
# if additional labels are HXB2, root tree at HXB2
#
tmp <- submitted.info[, {
stree <- unroot(strs[[IDX]])
otree <- unroot(ttrs[[TIME_IDX_T]])
z <- setdiff(otree$tip.label, stree$tip.label)
list(CHECK= length(z)==abs(diff(c(TAXAN, TAXAN_T))) )
}, by='IDX']
tmp <- merge(subset(tmp, !CHECK), submitted.info, by='IDX')
stopifnot(!nrow(tmp))
submitted.info[, TAXAN:= sapply(strs, Ntip)]
#
# are trees rooted?
#
setkey(submitted.info, IDX)
submitted.info[, ROOTED:=factor(sapply(strs, is.rooted),levels=c(TRUE,FALSE),labels=c('Y','N'))]
#
# root with RTT
#
options(warn=2)
strs_rtt <- vector('list', length(strs))
for(i in submitted.info[, IDX])
{
cat('\n',i)
#i <- 628 ; i<- 241; i<- 571
ph <- strs[[i]]
tmp <- data.table(TAXA=ph$tip.label)
set(tmp, NULL, 'T_SEQ', tmp[, as.numeric(regmatches(TAXA, regexpr('[0-9]*\\.[0-9]+$|[0-9]+$', TAXA))) ])
#phr <- rtt(ph, tmp[, as.numeric(T_SEQ)])
phr <- rtt(ph, tmp[, T_SEQ], ncpu=4) #this may drop a tip!
strs_rtt[[i]] <- ladderize(phr)
}
names(strs_rtt) <- names(strs)
options(warn=0)
# set node labels
for(i in seq_along(strs))
{
strs[[i]][['node.label']]<- as.numeric(strs[[i]][['node.label']])
strs[[i]][['node.label']][ is.na(strs[[i]][['node.label']]) ] <- 0
}
for(i in seq_along(strs_rtt))
{
strs_rtt[[i]][['node.label']]<- as.numeric(strs_rtt[[i]][['node.label']])
strs_rtt[[i]][['node.label']][ is.na(strs_rtt[[i]][['node.label']]) ] <- 0
}
#
#outdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation'
#save(strs, strs_rtt, ttrs, trungps, tinfo, tfiles, tbrl, submitted.info, file=file.path(outdir,'submitted_170424.rda'))
#
# read LSD trees
#
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/LSD_FastTree'
infiles <- data.table(FILE=list.files(indir, pattern='LSD.date.newick$', full.names=TRUE))
infiles[, IDX:= as.integer(gsub('IDX_','',regmatches(basename(FILE),regexpr('IDX_[0-9]+',basename(FILE)))))]
setkey(infiles, IDX)
strs_lsd <- vector('list', submitted.info[, max(IDX)])
for(i in seq_len(nrow(infiles)))
{
# i<- 439
IDX <- infiles[i,IDX]
FILE <- infiles[i,FILE]
cat('\n',IDX)
ph <- readLines(FILE)
if(grepl('^\\(\\(\\):0,', ph) & grepl(':-?[0-9].?[0-9]*[eE]*-?[0-9]*\\);$', ph))
{
ph <- gsub(':-?[0-9].?[0-9]*[eE]*-?[0-9]*\\);$',';',gsub('^\\(\\(\\):0,','',ph))
}
ph <- read.tree(text=ph)
stopifnot( !is.null(ph) )
stopifnot( identical(sort(strs[[IDX]]$tip.label), sort(ph$tip.label)) )
strs_lsd[[IDX]] <- ph
#names(strs_lsd[[IDX]]) <- FILE
}
setkey(submitted.info, IDX)
submitted.info[, WITH_LSD:= factor(sapply(strs_lsd, is.null), levels=c(TRUE,FALSE), labels=c('N','Y'))]
submitted.info <- subset(submitted.info, WITH_LSD=='Y')
#
# ladderize all trees
#
ttrs <- lapply(ttrs, ladderize)
strs <- lapply(strs, ladderize)
strs_lsd<- lapply(strs_lsd, function(ph){
if(!is.null(ph))
ph <- ladderize(ph)
ph
} )
#
# SAVE so far
#
outdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation'
save(strs, strs_lsd, strs_rtt, ttrs, trungps, tinfo, tfiles, tbrl, submitted.info, file=file.path(outdir,'submitted_170424.rda'))
}
##--------------------------------------------------------------------------------------------------------
## olli 27.06.16
##--------------------------------------------------------------------------------------------------------
treecomparison.submissions.160627<- function()
{
require(data.table)
require(ape)
require(adephylo)
require(phangorn)
require(parallel)
#
# get true trees
#
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim_internal/freeze_July15'
tfiles <- list.files(indir, pattern='newick$', full.names=TRUE)
tfiles <- data.table( FILE_T= tfiles[ grepl('SUBSTTREE', tfiles) | grepl('Vill_99', tfiles) | grepl('Vill.*DATEDTREE', tfiles) ] )
tfiles[, SC:= toupper(gsub('_SUBSTTREE|_DATEDTREE','',gsub('.newick','',basename(FILE_T))))]
tmp <- rbind( subset(tfiles, SC=='VILL_99_APR15'), subset(tfiles, SC=='VILL_99_APR15'), subset(tfiles, SC=='VILL_99_APR15') )
set(tmp, NULL, 'SC', c('150701_VILL_SCENARIO-C','150701_VILL_SCENARIO-D','150701_VILL_SCENARIO-E'))
tfiles <- rbind(tfiles, tmp)
tmp <- list.files(indir, pattern='newick$', full.names=TRUE)
tmp <- data.table( FILE_T= tmp[ grepl('Reg.*DATEDTREE', tmp) ] )
tmp[, SC:= toupper(gsub('_SUBSTTREE|_DATEDTREE','',gsub('.newick','',basename(FILE_T))))]
tfiles <- rbind(tfiles, tmp)
tfiles[, BRL_T:= 'time']
set(tfiles, tfiles[, which(grepl('REG',SC) & grepl('SUBST',FILE_T))], 'BRL_T', 'subst')
ttrs <- lapply(tfiles[, FILE_T], function(x) read.tree(file=x) )
names(ttrs) <- tfiles[, SC]
for(z in c('VILL_99_APR15','150701_VILL_SCENARIO-C','150701_VILL_SCENARIO-D','150701_VILL_SCENARIO-E'))
ttrs[[z]] <- root(ttrs[[z]], node=Ntip(ttrs[[z]])+2, resolve.root=1)
tfiles[, IDX_T:=seq_along(ttrs)]
tfiles[, TAXAN_T:= sapply(ttrs, Ntip)]
# patristic distances on true trees (by time and subst/site)
tbrl <- lapply(seq_len(nrow(tfiles)), function(i)
{
ph <- ttrs[[tfiles[i, IDX_T]]]
tmp <- cophenetic.phylo(ph)
tmp <- as.data.table(melt(tmp))
setnames(tmp, c('Var1','Var2','value'),c('TAXA1','TAXA2','PD'))
tmp <- subset(tmp, TAXA1!=TAXA2)
tmp[, IDX_T:= tfiles[i, IDX_T]]
tmp[, SC:= tfiles[i, SC]]
tmp[, BRL_T:= tfiles[i, BRL_T]]
tmp[, TAXAN_T:= tfiles[i, TAXAN_T]]
tmp
})
tbrl <- do.call('rbind',tbrl)
# info on true trees
tinfo <- merge(tfiles, do.call('rbind',lapply(seq_along(ttrs), function(i) data.table(TAXA=ttrs[[i]]$tip.label, IDX_T=i))), by='IDX_T')
tinfo[, IDPOP:=NA_character_]
tmp <- tinfo[, which(grepl('REGIONAL',SC))]
set(tinfo, tmp, 'IDPOP', tinfo[tmp,regmatches(TAXA, regexpr('IDPOP_[0-9]+',TAXA))])
tmp <- tinfo[, which(!grepl('REGIONAL',SC))]
set(tinfo, tmp, 'IDPOP', tinfo[tmp, regmatches(TAXA, regexpr('HOUSE[0-9]+-[0-9]+|House[0-9]+-[0-9]+',TAXA))])
stopifnot(subset(tinfo, grepl('VILL',SC))[, length(which(substring(TAXA,1,10)!=substring(IDPOP,1,10)))]==0)
stopifnot( tinfo[, length(which(is.na(IDPOP)))==0] )
set(tinfo, NULL, 'IDPOP', tinfo[,toupper(IDPOP)])
set(tinfo, NULL, 'TAXA', tinfo[,toupper(TAXA)])
# read cluster membership from DATEDCLUTREES
tmp <- list.files(indir, pattern='DATEDCLUTREES', full.names=TRUE)
tmp <- rbind( data.table( FILE_CLU_T= tmp,
SC= toupper(gsub('_DATEDCLUTREES','',gsub('.newick','',basename(tmp)))),
BRL_T= 'time'),
data.table( FILE_CLU_T= tmp,
SC= toupper(gsub('_DATEDCLUTREES','',gsub('.newick','',basename(tmp)))),
BRL_T= 'subst') )
tfiles <- merge(tfiles, tmp, by=c('SC','BRL_T'), all=1)
tmp <- subset(tfiles, !is.na(FILE_CLU_T))[, {
z <- read.tree(FILE_CLU_T)
do.call('rbind',lapply(seq_along(z), function(i) data.table(IDCLU=i, TAXA=z[[i]]$tip.label)))
}, by=c('SC','BRL_T')]
tinfo <- merge(tinfo, tmp, by=c('SC','BRL_T','TAXA'), all=1)
tmp <- subset(tinfo, !is.na(IDCLU))[, list(CLU_N= length(IDPOP)), by=c('SC','BRL_T','IDCLU')]
tinfo <- merge(tinfo, tmp, by=c('SC','BRL_T','IDCLU'), all=1)
# read sequences and determine %gappiness
tmp <- list.files(indir, pattern='fa$|fasta$', full.names=TRUE)
tmp <- data.table( FILE_SEQ_T= tmp, SC= toupper(gsub('_SIMULATED','',gsub('.fa','',basename(tmp)))))
z <- subset(tmp, SC=='VILL_99_APR15')
set(z, NULL, 'SC', '150701_VILL_SCENARIO-C')
tmp <- rbind( tmp, z )
tfiles <- merge(tfiles, tmp, by='SC', all=1)
tmp <- subset(tfiles, !is.na(FILE_SEQ_T))[, {
z <- read.dna(FILE_SEQ_T, format='fasta')
ans <- sapply(seq_len(nrow(z)), function(i) base.freq(z[i,], all=1))
ans <- apply(ans[c('n','-','?'),], 2, sum)
list(TAXA=rownames(z), GPS=ans)
}, by=c('SC','BRL_T')]
tinfo <- merge(tinfo, tmp, by=c('SC','BRL_T','TAXA'), all.x=1)
# add % gaps by gene for regional
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations'
infiles <- data.table(FILE=list.files(indir, full.names=1, pattern='_TRAIN[0-9]+_SIMULATED.fa$'))
infiles[, SC:= toupper(gsub('_SIMULATED.fa','',basename(FILE)))]
tmp <- infiles[, {
#FILE<- "/Users/Oliver/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations/150701_Regional_TRAIN4_SIMULATED.fa"
sq <- read.dna(FILE, format='fa')
seqi<- as.data.table(read.csv(gsub('.fa','_gene.txt',FILE), header=FALSE))
seqi[, GENE:= regmatches(V2, regexpr('[a-z]+', V2))]
seqi[, START:= as.integer(gsub('-','',regmatches(V2, regexpr('[0-9]+-', V2))))]
seqi[, END:= as.integer(gsub('-','',regmatches(V2, regexpr('-[0-9]+', V2))))]
seqi <- subset(seqi, select=c(GENE,START,END))
seqi <- rbind(seqi, data.table(GENE='full', START=1L, END= seqi[, max(END)]))
ans <- seqi[, {
#START<- 1474; END<- 5466
z <- as.character(sq[,START:END])
tmp <- apply(z, 2, function(x) all(x%in%c('-','?')))
z <- z[, !tmp]
tmp <- apply(z, 1, function(x) length(which(x=='?'))) / ncol(z)
list(TAXA=names(tmp), GAPS_P=tmp)
}, by='GENE']
set(ans, NULL, 'GENE', ans[, paste(toupper(GENE),'_GAPS_P',sep='')])
ans <- dcast.data.table(ans, TAXA~GENE, value.var='GAPS_P')
ans
}, by='SC']
tmp <- tmp[, list(FULL_GAPS_P=mean(FULL_GAPS_P), GAG_GAPS_P=mean(GAG_GAPS_P), POL_GAPS_P=mean(POL_GAPS_P), ENV_GAPS_P=mean(ENV_GAPS_P)), by='SC']
tinfo <- merge(tinfo, tmp, all.x=1, by='SC')
#
# to tinfo add actual transmitters
#
# check TRAIN1
load( '/Users/Oliver/Dropbox (SPH Imperial College)/PANGEAHIVsim_internal/freeze_July15/150701_Regional_TRAIN1_SIMULATED_INTERNAL.R' )
ch <- subset(tinfo, SC=='150701_REGIONAL_TRAIN1' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(DOB==DOB_CH)], ch[, all(GENDER==GENDER_CH)], ch[, all(TIME_SEQ==TIME_SEQ_CH)] )
subset(ch, TIME_SEQ!=TIME_SEQ_CH)
# OK :-) schedule adding IDPOP_T
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDREC,sep='')]]
tinfo.add <- subset(df.trms, select=c(IDPOP, IDTR))
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDTR,sep='')]]
tinfo.add <- merge(tinfo.add, subset(df.trms, select=c(IDPOP, IDREC)), by='IDPOP', all=1)
set(ch, NULL, 'IDPOP', ch[, paste('IDPOP_',IDPOP,sep='')])
tinfo.add <- merge(tinfo.add, subset(ch, select=IDPOP), by='IDPOP')
tinfo.add[, SC:='150701_REGIONAL_TRAIN1']
# check TRAIN2
load( '/Users/Oliver/Dropbox (SPH Imperial College)/PANGEAHIVsim_internal/freeze_July15/150701_Regional_TRAIN2_SIMULATED_INTERNAL.R' )
ch <- subset(tinfo, SC=='150701_REGIONAL_TRAIN2' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(DOB==DOB_CH)], ch[, all(GENDER==GENDER_CH)], ch[, all(TIME_SEQ==TIME_SEQ_CH)] )
subset(ch, TIME_SEQ!=TIME_SEQ_CH)
# OK :-) schedule adding IDPOP_T
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDREC,sep='')]]
tmp <- subset(df.trms, select=c(IDPOP, IDTR))
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDTR,sep='')]]
tmp <- merge(tmp, subset(df.trms, select=c(IDPOP, IDREC)), by='IDPOP', all=1)
set(ch, NULL, 'IDPOP', ch[, paste('IDPOP_',IDPOP,sep='')])
tmp <- merge(tmp, subset(ch, select=IDPOP), by='IDPOP')
tmp[, SC:='150701_REGIONAL_TRAIN2']
tinfo.add <- rbind(tinfo.add, tmp)
# check TRAIN4
ch <- subset(tinfo, SC=='150701_REGIONAL_TRAIN4' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(DOB==DOB_CH)], ch[, all(GENDER==GENDER_CH)], ch[, all(TIME_SEQ==TIME_SEQ_CH)] )
subset(ch, TIME_SEQ!=TIME_SEQ_CH)
tmp[, SC:='150701_REGIONAL_TRAIN4']
tinfo.add <- rbind(tinfo.add, tmp)
# check TRAIN3
load( '/Users/Oliver/Dropbox (SPH Imperial College)/PANGEAHIVsim_internal/freeze_July15/150701_Regional_TRAIN3_SIMULATED_INTERNAL.R' )
ch <- subset(tinfo, SC=='150701_REGIONAL_TRAIN3' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(DOB==DOB_CH)], ch[, all(GENDER==GENDER_CH)], ch[, all(TIME_SEQ==TIME_SEQ_CH)] )
subset(ch, TIME_SEQ!=TIME_SEQ_CH)
# OK :-) schedule adding IDPOP_T
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDREC,sep='')]]
tmp <- subset(df.trms, select=c(IDPOP, IDTR))
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDTR,sep='')]]
tmp <- merge(tmp, subset(df.trms, select=c(IDPOP, IDREC)), by='IDPOP', all=1)
set(ch, NULL, 'IDPOP', ch[, paste('IDPOP_',IDPOP,sep='')])
tmp <- merge(tmp, subset(ch, select=IDPOP), by='IDPOP')
tmp[, SC:='150701_REGIONAL_TRAIN3']
tinfo.add <- rbind(tinfo.add, tmp)
# check TRAIN5
ch <- subset(tinfo, SC=='150701_REGIONAL_TRAIN5' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(DOB==DOB_CH)], ch[, all(GENDER==GENDER_CH)], ch[, all(TIME_SEQ==TIME_SEQ_CH)] )
subset(ch, TIME_SEQ!=TIME_SEQ_CH)
tmp[, SC:='150701_REGIONAL_TRAIN5']
tinfo.add <- rbind(tinfo.add, tmp)
#
# add transmitters for regional to tinfo
#
tinfo <- merge(tinfo, tinfo.add, by=c('IDPOP', 'SC'), all.x=1)
#
# add true node depths to tinfo
#
tmp <- tinfo[, {
cat(IDX_T,'\n')
ph<- ttrs[[IDX_T]]
list(DEPTH_T=node.depth.edgelength(ph)[seq_len(Ntip(ph))], TAXA=ph$tip.label)
}, by='IDX_T']
tinfo <- merge(tinfo, tmp, by=c('IDX_T','TAXA'), all.x=1)
#
# compute closest individual on true trees
#
tmp <- unique(subset(tinfo, select=c(SC, BRL_T, IDX_T)))
tmp <- tmp[, {
print(IDX_T)
ph <- ttrs[[IDX_T]]
model.reg <- grepl('REGIONAL',SC)
treedist.closest.ind(ph, model.reg)
}, by=c('SC','BRL_T','IDX_T')]
tinfo <- merge(tinfo, tmp, by=c('SC','BRL_T','IDX_T','IDPOP'))
set(tinfo, NULL, 'IDPOP_CL', tinfo[, gsub('IDPOP_','',IDPOP_CL)])
#
# add if transmitter sampled
#
tmp <- subset(tinfo, grepl('REGIONAL',SC))
set(tmp, NULL, 'IDPOP', tmp[,as.integer(gsub('IDPOP_','',IDPOP))])
setkey(tmp, IDX_T, IDPOP)
tmp <- unique(tmp)[, {
z <- IDX_T
list(IDTR_SAMPLED=ifelse(IDTR%in%subset(tmp, IDX_T==z)[['IDPOP']], 'Y', 'N'))
}, by=c('IDX_T','IDPOP')]
set(tmp, NULL, 'IDPOP', tmp[, paste('IDPOP_',IDPOP,sep='')])
tinfo <- merge(tinfo, tmp, by=c('IDX_T','IDPOP'),all.x=1)
#
# get submitted trees
#
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/IQTree/IQTree201507'
infiles <- list.files(indir, pattern='treefile$', recursive=1, full.names=1)
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/IQTree/IQTree201510'
infiles <- c(infiles, list.files(indir, pattern='treefile$', recursive=1, full.names=1))
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/IQTree/IQTREE_Update_gag'
infiles <- c(infiles, list.files(indir, pattern='treefile$', recursive=1, full.names=1))
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/PhyML'
infiles <- c(infiles, list.files(indir, pattern='*tree*', recursive=1, full.names=1))
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/RAxML'
infiles <- c(infiles, list.files(indir, pattern='*RAxML_bestTree*', recursive=1, full.names=1))
infiles <- c(infiles, list.files(indir, pattern="best_tree", recursive=1, full.names=1))
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/running_gaps'
infiles <- c(infiles, list.files(indir, pattern="newick", recursive=1, full.names=1))
infiles <- data.table(FILE=infiles)
strs <- lapply(infiles[, FILE], function(x)
{
cat(x)
read.tree(file=x)
})
names(strs) <- infiles[, FILE]
#
# add MetaPIGA full genome trees, version 160713. stored as list of newicks
#
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/MetaPIGA_160713'
tmp <- list.files(indir, pattern='*txt*', recursive=1, full.names=1)
tmp.trees <- lapply(tmp, function(x)
{
cat(x)
read.tree(file=x)
})
MetaPIGA.trees <- c(tmp.trees[[1]],tmp.trees[[2]],tmp.trees[[3]],tmp.trees[[4]],tmp.trees[[5]],tmp.trees[[6]])
tmp <- sapply( seq_along(tmp), function(i) paste(gsub('.txt','',tmp[i]), '_tree', seq_along(tmp.trees[[i]]), sep='') )
names(MetaPIGA.trees) <- unlist(tmp)
strs <- c(strs, unclass(MetaPIGA.trees))
#
# to keep old index, add MVR trees now
#
options(warn=2)
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/tree_mvr'
tmp <- data.table(FILE=list.files(indir, pattern="newick", recursive=1, full.names=1))
mvrtrs <- lapply(tmp[, FILE], function(x)
{
cat(x)
read.tree(file=x)
})
names(mvrtrs) <- tmp[, FILE]
strs <- c(strs, mvrtrs)
#
# add MetaPIGA trees, version 150831. stored as nexus
#
#indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/MetaPIGA_150831'
#tmp <- list.files(indir, pattern='*result*', recursive=1, full.names=1)
#tmp <- data.table(FILE=tmp)
#tmp.trees <- lapply(tmp[, FILE], function(x)
# {
# cat(x)
# read.nexus(file=x)
# })
#sapply(tmp.trees, length)
#MetaPIGA.trees <- c(lapply(tmp.trees, '[[', 1), lapply(tmp.trees, '[[', 2), lapply(tmp.trees, '[[', 3), lapply(tmp.trees, '[[', 4))
#names(MetaPIGA.trees) <- c(sapply(tmp.trees, function(x) paste(names(x)[1],'_use',sep='')), sapply(tmp.trees, function(x) names(x)[2]), sapply(tmp.trees, function(x) names(x)[3]), sapply(tmp.trees, function(x) names(x)[4]))
#names(MetaPIGA.trees) <- gsub("'",'',names(MetaPIGA.trees), fixed=1)
#strs <- c(strs, MetaPIGA.trees)
#
#
#
submitted.info <- data.table(FILE=names(strs))
submitted.info[, IDX:=seq_along(strs)]
#
# set team
#
submitted.info[, TEAM:=NA_character_]
set(submitted.info, submitted.info[, which(grepl('RAXML|RAxML',FILE))], 'TEAM', 'RAXML')
set(submitted.info, submitted.info[, which(grepl('IQTree',FILE))], 'TEAM', 'IQTree')
set(submitted.info, submitted.info[, which(grepl('MetaPIGA|Consensus pruning|Best individual of population',FILE))], 'TEAM', 'MetaPIGA')
set(submitted.info, submitted.info[, which(grepl('PhyML',FILE))], 'TEAM', 'PhyML')
set(submitted.info, submitted.info[, which(grepl('running_gaps',FILE))], 'TEAM', 'RUNGAPS_ExaML')
set(submitted.info, submitted.info[, which(grepl('tree_mvr.*MVR_C_0\\.newick',FILE))], 'TEAM', 'MVR')
set(submitted.info, submitted.info[, which(grepl('tree_mvr.*BioNJ_C_0\\.newick',FILE))], 'TEAM', 'BioNJ')
stopifnot( submitted.info[, length(which(is.na(TEAM)))==0] )
#
# scenario
#
submitted.info[, SC:=NA_character_]
tmp <- submitted.info[, which(grepl('150701_Regional_TRAIN[0-9]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, regmatches(FILE, regexpr('150701_Regional_TRAIN[0-9]',FILE))])
tmp <- submitted.info[, which(grepl('150701_Vill_SCENARIO-[A-Z]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, regmatches(FILE, regexpr('150701_Vill_SCENARIO-[A-Z]',FILE))])
tmp <- submitted.info[, which(is.na(SC) & grepl('TRAIN[0-9]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, paste('150701_Regional_',regmatches(FILE, regexpr('TRAIN[0-9]',FILE)),sep='')])
tmp <- submitted.info[, which(is.na(SC) & grepl('scenario[A-Z]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, paste('150701_Vill_',regmatches(FILE, regexpr('scenario[A-Z]',FILE)),sep='')])
tmp <- submitted.info[, which(is.na(SC) & grepl('150701_regional_train[0-9]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, regmatches(FILE, regexpr('150701_regional_train[0-9]',FILE))])
tmp <- submitted.info[, which(is.na(SC) & grepl('150701_vill_scenario-[A-Z]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, regmatches(FILE, regexpr('150701_vill_scenario-[A-Z]',FILE))])
tmp <- submitted.info[, which(is.na(SC) & grepl('Vill_99_Apr15', FILE))]
set(submitted.info, tmp, 'SC', 'Vill_99_Apr15')
set(submitted.info, NULL, 'SC', submitted.info[, toupper(SC)])
tmp <- submitted.info[, which(grepl('150701_VILL_SCENARIO[A-Z]', SC))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, gsub('150701_VILL_SCENARIO','150701_VILL_SCENARIO-',SC)])
stopifnot( submitted.info[, length(which(is.na(SC)))==0] )
#
# define running gaps for the running gaps analysis
#
submitted.info[, RUNGGAPS:=NA_real_]
tmp <- submitted.info[, which(TEAM=='RUNGAPS_ExaML')]
set(submitted.info, tmp, 'RUNGAPS', submitted.info[tmp, as.numeric(gsub('TRAIN[0-9]','',regmatches(FILE,regexpr('TRAIN[0-9]+',FILE))))/100])
#
# set covariates of scenarios
#
tmp <- data.table( SC= c("150701_REGIONAL_TRAIN1","150701_REGIONAL_TRAIN2","150701_REGIONAL_TRAIN3","150701_REGIONAL_TRAIN4" ,"150701_REGIONAL_TRAIN5", "150701_VILL_SCENARIO-A", "150701_VILL_SCENARIO-B", "VILL_99_APR15","150701_VILL_SCENARIO-C", "150701_VILL_SCENARIO-D", "150701_VILL_SCENARIO-E"),
MODEL= c('R','R','R','R','R','V','V','V','V','V','V'),
SEQCOV= c(0.16, 0.16, 0.16, 0.16, 0.16, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6),
ACUTE= c('low', 'low', 'high', 'low', 'high', 'high', 'high', 'high', 'high', 'high', 'high'),
GAPS= c('none', 'low', 'low', 'high', 'high', 'low', 'high', 'none', 'none', 'low', 'high'),
ART= c('none', 'none', 'none', 'none', 'none', 'none', 'none', 'fast', 'fast', 'fast', 'fast'),
EXT= c('5pc', '5pc', '5pc', '5pc', '5pc', '~0pc', '~0pc', '~0pc', '~0pc', '~0pc', '~0pc') )
submitted.info <- merge(submitted.info, tmp, by='SC')
#
# set which gene used to construct tree (either pol or concatenated gag+pol+env)
#
submitted.info[, GENE:=NA_character_]
set(submitted.info, submitted.info[, which(TEAM=='RAXML' & grepl('full', FILE))], 'GENE', 'GAG+POL+ENV')
set(submitted.info, submitted.info[, which(TEAM=='RAXML' & grepl('pol', FILE))], 'GENE', 'POL')
set(submitted.info, submitted.info[, which(TEAM=='RAXML' & grepl('gag', FILE))], 'GENE', 'GAG')
stopifnot(nrow(subset(submitted.info, TEAM=='RAXML' & is.na(GENE)))==0)
set(submitted.info, submitted.info[, which(TEAM=='PhyML' & grepl('gag', FILE))], 'GENE', 'GAG')
set(submitted.info, submitted.info[, which(TEAM=='PhyML' & grepl('pol', FILE))], 'GENE', 'POL')
set(submitted.info, submitted.info[, which(TEAM=='PhyML' & grepl('gagpolenv', FILE))], 'GENE', 'GAG+POL+ENV')
stopifnot(nrow(subset(submitted.info, TEAM=='RAXML' & is.na(GENE)))==0)
set(submitted.info, submitted.info[, which(TEAM=='MetaPIGA' & grepl('gag', FILE))], 'GENE', 'GAG')
set(submitted.info, submitted.info[, which(TEAM=='MetaPIGA' & grepl('all', FILE))], 'GENE', 'GAG+POL+ENV')
stopifnot(nrow(subset(submitted.info, TEAM=='MetaPIGA' & is.na(GENE)))==0)
set(submitted.info, submitted.info[, which(TEAM=='IQTree' & grepl('[0-9]_partition', FILE))], 'GENE', 'GAG+POL+ENV')
set(submitted.info, submitted.info[, which(TEAM=='IQTree' & grepl('[0-9]_pol_partition', FILE))], 'GENE', 'POL')
set(submitted.info, submitted.info[, which(TEAM=='IQTree' & grepl('[0-9]_gag_partition', FILE))], 'GENE', 'GAG')
stopifnot(nrow(subset(submitted.info, TEAM=='IQTree' & is.na(GENE)))==0)
set(submitted.info, submitted.info[, which(TEAM=='RUNGAPS_ExaML' & grepl('FULL_SIMULATED', FILE))], 'GENE', 'GAG+POL+ENV')
set(submitted.info, submitted.info[, which(TEAM=='RUNGAPS_ExaML' & grepl('GAG_SIMULATED', FILE))], 'GENE', 'GAG')
set(submitted.info, submitted.info[, which(TEAM=='RUNGAPS_ExaML' & grepl('GAGPP_SIMULATED', FILE))], 'GENE', 'GAG+PARTIALPOL')
set(submitted.info, submitted.info[, which(TEAM=='RUNGAPS_ExaML' & grepl('P17_SIMULATED', FILE))], 'GENE', 'P17')
stopifnot(nrow(subset(submitted.info, TEAM=='RUNGAPS_ExaML' & is.na(GENE)))==0)
set(submitted.info, submitted.info[, which(TEAM%in%c('MVR','BioNJ') & grepl('gag+pol+env', FILE))], 'GENE', 'GAG+POL+ENV')
set(submitted.info, submitted.info[, which(TEAM%in%c('MVR','BioNJ') & grepl('pol', FILE))], 'GENE', 'POL')
set(submitted.info, submitted.info[, which(TEAM%in%c('MVR','BioNJ') & grepl('gag', FILE))], 'GENE', 'GAG')
set(submitted.info, submitted.info[, which(TEAM%in%c('MVR','BioNJ') & grepl('env', FILE))], 'GENE', 'ENV')
stopifnot(nrow(subset(submitted.info, TEAM%in%c('MVR','BioNJ') & is.na(GENE)))==0)
#
# best tree for each scenario
#
submitted.info[, BEST:='N']
set(submitted.info, submitted.info[, which(grepl('RAxML', FILE) & grepl('best_tree', FILE))], 'BEST', 'Y')
# copied from ListOfBestTrees_IQTree150818.txt
# there are several best trees for some scenarios
tmp <- c( '150701_Vill_SCENARIO-A_IQTree150814_partition_12_3_07',
'150701_Vill_SCENARIO-A_IQTree150814_partition_12_3_04.',
'150701_Vill_SCENARIO-B_IQTree150814_partition_12_3_03.',
'Vill_99_Apr15_IQTree150814_partition_123.',
'150701_Vill_SCENARIO-D_IQTree150814_partition_12_3.',
'150701_Vill_SCENARIO-E_IQTree150814_partition_12_3.',
'150701_Vill_SCENARIO-A_IQTree150814_pol_partition_12_3.',
'150701_Vill_SCENARIO-B_IQTree150814_pol_partition_12_3_05.',
'Vill_99_Apr15_IQTree150814_pol_partition_12_3_09.',
'Vill_99_Apr15_IQTree150814_pol_partition_12_3_10.',
'150701_Vill_SCENARIO-D_IQTree150814_pol_partition_12_3_05.',
'150701_Vill_SCENARIO-D_IQTree150814_pol_partition_12_3_06.',
'150701_Vill_SCENARIO-D_IQTree150814_pol_partition_12_3_09.',
'150701_Vill_SCENARIO-E_IQTree150814_pol_partition_12_3_06.',
'150701_Regional_TRAIN1_IQTree150818_partition_123_03.',
'150701_Regional_TRAIN1_IQTree150818_pol_partition_123_05.')
tmp <- sapply(tmp, function(x) submitted.info[, which((grepl('IQTree150814/', FILE, fixed=1) | grepl('IQTree150818/', FILE, fixed=1)) & grepl(x, FILE, fixed=1))] )
set(submitted.info, tmp, 'BEST', 'Y')
tmp <- c( '150701_Regional_TRAIN2_IQTree151019_partition_123_10',
'150701_Regional_TRAIN3_IQTree151019_partition_123_03',
'150701_Regional_TRAIN4_IQTree151019_partition_123_10',
'150701_Regional_TRAIN5_IQTree151019_partition_123_01',
'150701_Regional_TRAIN2_IQTree151019_pol_partition_123_08',
'150701_Regional_TRAIN3_IQTree151019_pol_partition_123_08',
'150701_Regional_TRAIN4_IQTree151019_pol_partition_123_05',
'150701_Regional_TRAIN5_IQTree151019_pol_partition_123_10')
tmp <- sapply(tmp, function(x) submitted.info[, which((grepl('IQTree151019', FILE, fixed=1)) & grepl(x, FILE, fixed=1))] )
set(submitted.info, tmp, 'BEST', 'Y')
tmp <- c( '150701_Regional_TRAIN1_IQTree160530_gag_partition_123_09',
'150701_Regional_TRAIN2_IQTree160530_gag_partition_123_06',
'150701_Regional_TRAIN4_IQTree160530_gag_partition_123_02')
tmp <- sapply(tmp, function(x) submitted.info[, which((grepl('IQTREE_Update_gag', FILE, fixed=1)) & grepl(x, FILE, fixed=1))] )
set(submitted.info, tmp, 'BEST', 'Y')
# for RUNGAPS_ExaML we only have one tree per gap coverage, so all are 'best'
set(submitted.info, submitted.info[, which(TEAM=='RUNGAPS_ExaML')], 'BEST', 'Y')
# PhyML: read log likelihood
lkl <- data.table(FILE= list.files('~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/PhyML', pattern='*phyml_stats*', recursive=1, full.names=1))
lkl <- lkl[, {
z <- readLines(FILE)
z <- as.numeric( gsub('\\s','',gsub('Log-likelihood:','',gsub('^\\.','',z[ which( grepl('Log-likelihood', z) ) ]))) )
list( LOGLKL=z )
}, by='FILE']
lkl[, SC:=NA_character_]
tmp <- lkl[, which(is.na(SC) & grepl('150701_regional_train[0-9]', FILE))]
set(lkl, tmp, 'SC', lkl[tmp, regmatches(FILE, regexpr('150701_regional_train[0-9]',FILE))])
tmp <- lkl[, which(is.na(SC) & grepl('150701_vill_scenario-[A-Z]', FILE))]
set(lkl, tmp, 'SC', lkl[tmp, regmatches(FILE, regexpr('150701_vill_scenario-[A-Z]',FILE))])
set(lkl, NULL, 'SC', lkl[, toupper(SC)])
lkl[, TEAM:= 'PhyML']
lkl[, GENE:= NA_character_]
set(lkl, lkl[, which(TEAM=='PhyML' & grepl('gag', FILE))], 'GENE', 'GAG')
set(lkl, lkl[, which(TEAM=='PhyML' & grepl('pol', FILE))], 'GENE', 'POL')
set(lkl, lkl[, which(TEAM=='PhyML' & grepl('gagpolenv', FILE))], 'GENE', 'GAG+POL+ENV')
setkey(lkl, GENE, SC)
tmp <- lkl[, list( FILE_BEST=FILE[ which.max(LOGLKL)[1] ] ), by=c('GENE','SC','TEAM')]
set(tmp, NULL, 'FILE_BEST', tmp[, gsub('phyml_stats','phyml_tree',FILE_BEST)])
set(submitted.info, submitted.info[, which(FILE%in%tmp$FILE_BEST)], 'BEST', 'Y')
#
# set OTHER (ie old or some preliminary/unknown tree)
#
submitted.info[, OTHER:='N']
# all MetaPIGA trees in 'MetaPIGA_150831' are old
set(submitted.info, submitted.info[, which(TEAM=='MetaPIGA' & !grepl('MetaPIGA',FILE))], 'OTHER', 'Y')
# IQTree did several uploads, use only most recent in main analysis
set(submitted.info, submitted.info[, which(grepl('150701_Regional_TRAIN1_IQTree150814', FILE))], 'OTHER', 'Y')
set(submitted.info, submitted.info[, which(TEAM=='IQTree' & MODEL=='R' & !grepl('TRAIN1', SC) & grepl('201507/',FILE,fixed=1))], 'OTHER', 'Y')
# RAxML gag_1606 are old
set(submitted.info, submitted.info[, which(TEAM=='RAXML' & grepl('gag_gene_1606',FILE))], 'OTHER', 'Y')
#
# add BRL_UNITS
#
submitted.info[, BRL:='subst']
#
# number taxa in tree
#
setkey(submitted.info, IDX)
submitted.info[, TAXAN:= sapply(strs, Ntip)]
#
# add index of true tree
#
require(phangorn)
tmp <- subset(tfiles, select=c('SC','IDX_T','BRL_T'))
tmp <- dcast.data.table(tmp, SC~BRL_T, value.var='IDX_T')
setnames(tmp, c('subst','time'), c("SUB_IDX_T","TIME_IDX_T"))
submitted.info <- merge(submitted.info, tmp, by='SC')
submitted.info <- merge(submitted.info, unique(subset(tfiles, select=c('SC','TAXAN_T'))), by='SC')
stopifnot(nrow(subset(submitted.info, TAXAN>TAXAN_T))==0)
#
# fix taxa names that teams have changed
#
tmp <- subset(submitted.info, TEAM=='IQTree' & MODEL=='R')[, IDX]
for(i in tmp)
{
strs[[i]]$tip.label <- sapply(strsplit(strs[[i]]$tip.label,'_'), function(x) paste(x[1],'_',x[2],'|',x[3],'|',x[4],'_',x[5],'|',x[6],sep='') )
}
for(i in seq_along(strs))
{
strs[[i]]$tip.label <- toupper(strs[[i]]$tip.label)
}
for(i in seq_along(ttrs))
{
ttrs[[i]]$tip.label <- toupper(ttrs[[i]]$tip.label)
}
tmp2 <- subset(tinfo, BRL_T=='time', select=c(IDPOP,SC,TAXA))
setkey(tmp2, IDPOP,SC,TAXA)
tmp2 <- unique(tmp2)
tmp <- subset(submitted.info, TEAM=='PhyML' & MODEL=='R')[, IDX]
for(i in tmp)
{
cat(i,'\n')
z <- data.table(IDX=seq_along(strs[[i]]$tip.label), IDPOP=regmatches(strs[[i]]$tip.label, regexpr('IDPOP_[0-9]+',strs[[i]]$tip.label)), SC=subset(submitted.info, IDX==i)[,SC])
z <- merge(tmp2, z, by=c('IDPOP','SC'))
setkey(z, IDX)
stopifnot(nrow(z)==Ntip(strs[[i]]))
strs[[i]]$tip.label <- z[, TAXA]
}
tmp <- subset(submitted.info, TEAM=='PhyML' & MODEL=='V')[, IDX]
for(i in tmp)
{
cat(i,'\n')
z <- data.table(IDX=seq_along(strs[[i]]$tip.label), IDPOP=regmatches(strs[[i]]$tip.label, regexpr('HOUSE[0-9]+-[0-9]+|House[0-9]+-[0-9]+',strs[[i]]$tip.label)), SC=subset(submitted.info, IDX==i)[,SC])
z <- merge(tmp2, z, by=c('IDPOP','SC'))
stopifnot(nrow(z)==length(strs[[i]]$tip.label))
setkey(z, IDX)
strs[[i]]$tip.label <- z[, TAXA]
}
#
# check labels and remove labels that do not appear in the observed tree
#
tmp <- submitted.info[, {
stree <- unroot(strs[[IDX]])
otree <- unroot(ttrs[[TIME_IDX_T]])
z <- setdiff(otree$tip.label, stree$tip.label)
list(CHECK= length(z)==abs(diff(c(TAXAN, TAXAN_T))) )
}, by='IDX']
tmp <- merge(subset(tmp, !CHECK), submitted.info, by='IDX')
for(i in seq_len(nrow(tmp)))
{
j <- tmp[i, IDX]
cat('\n',j)
otree <- tmp[i, TIME_IDX_T]
stree <- unroot(strs[[j]])
otree <- unroot(ttrs[[otree]])
z <- merge( data.table(TAXA=stree$tip.label, TYPE='s'), data.table(TAXA=otree$tip.label, TYPE='o'), by='TAXA', all=1)
z[, IDPOP:= gsub('IDPOP_','',regmatches(TAXA, regexpr('IDPOP_[0-9]+',TAXA)))]
strs[[j]] <- drop.tip(stree, subset( z, is.na(TYPE.y))[, TAXA])
}
submitted.info[, TAXAN:= sapply(strs, Ntip)]
#
# are trees rooted?
#
setkey(submitted.info, IDX)
submitted.info[, ROOTED:=factor(sapply(strs, is.rooted),levels=c(TRUE,FALSE),labels=c('Y','N'))]
#
# read LSD trees
#
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/LSD'
infiles <- data.table(FILE=list.files(indir, pattern='LSD.date.newick', full.names=TRUE))
infiles[, IDX:= as.integer(gsub('IDX_','',regmatches(basename(FILE),regexpr('IDX_[0-9]+',basename(FILE)))))]
setkey(infiles, IDX)
strs_lsd <- vector('list', submitted.info[, max(IDX)])
for(i in seq_len(nrow(infiles)))
{
# i<- 439
IDX <- infiles[i,IDX]
FILE <- infiles[i,FILE]
cat('\n',IDX)
ph <- read.tree(FILE)
stopifnot( !is.null(ph) )
stopifnot( identical(sort(strs_rtt[[IDX]]$tip.label), sort(ph$tip.label)) )
strs_lsd[[IDX]] <- ph
#names(strs_lsd[[IDX]]) <- FILE
}
setkey(submitted.info, IDX)
submitted.info[, WITH_LSD:= factor(sapply(strs_lsd, is.null), levels=c(TRUE,FALSE), labels=c('N','Y'))]
#
# SAVE so far
#
outdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation'
save(strs, strs_lsd, ttrs, tinfo, tfiles, tbrl, submitted.info, file=file.path(outdir,'submitted_160713.rda'))
}
##--------------------------------------------------------------------------------------------------------
## olli 27.06.16
##--------------------------------------------------------------------------------------------------------
treecomparison.submissions.160627.addLSDtrees<- function()
{
require(data.table)
require(ape)
require(phangorn)
wfile <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation/submitted_160713_05QD.rda'
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/LSD'
load(wfile)
infiles <- data.table(FILE=list.files(indir, pattern='LSD.date.newick', full.names=TRUE))
infiles[, IDX:= as.integer(gsub('IDX_','',regmatches(basename(FILE),regexpr('IDX_[0-9]+',basename(FILE)))))]
setkey(infiles, IDX)
strs_lsd <- vector('list', submitted.info[, max(IDX)])
for(i in seq_len(nrow(infiles)))
{
# i<- 439
IDX <- infiles[i,IDX]
FILE <- infiles[i,FILE]
cat('\n',IDX)
ph <- read.tree(FILE)
stopifnot( !is.null(ph) )
stopifnot( identical(sort(strs_rtt[[IDX]]$tip.label), sort(ph$tip.label)) )
strs_lsd[[IDX]] <- ph
#names(strs_lsd[[IDX]]) <- FILE
}
setkey(submitted.info, IDX)
submitted.info[, WITH_LSD:= factor(sapply(strs_lsd, is.null), levels=c(TRUE,FALSE), labels=c('N','Y'))]
save(strs, strs_rtt, strs_lsd, ttrs, tbrl, tinfo, tfiles, submitted.info, sclu.info, lba, file=gsub('_05QD.rda','_06LSD.rda',wfile))
str1 <- unroot(strs_rtt[[1]])
str2 <- unroot(strs_lsd[[1]])
z <- setdiff(str1$tip.label, str2$tip.label)
RF.dist(str1, str2, check.labels=TRUE)
}
##--------------------------------------------------------------------------------------------------------
## olli 27.06.16
##--------------------------------------------------------------------------------------------------------
treecomparison.submissions.161223.stuffoncluster<- function(file)
{
require(data.table)
require(ape)
require(adephylo)
require(phangorn)
load(file)
tmp[, MODEL:='R']
setnames(tmp, c('IDX','IDX_NEW'), c('IDX_OLD','IDX'))
tmp2 <- treedist.quartetdifference.clusters.wrapper(tmp, ttrs, ph_tmp, tinfo)
save(tmp, ttrs, ph_tmp, tinfo, tmp2, file=gsub('.rda','_01extra.rda',file))
}
##--------------------------------------------------------------------------------------------------------
## olli 27.06.16
##--------------------------------------------------------------------------------------------------------
treecomparison.submissions.160627.stuffoncluster<- function(file, hpc.select=NA)
{
require(data.table)
require(ape)
require(adephylo)
require(phangorn)
load(file)
submitted.info[, HPC:= ceiling(IDX/10)]
if(!is.na(hpc.select))
{
cat('\nProcessing hpc.select=', hpc.select)
submitted.info <- subset(submitted.info, HPC==hpc.select)
file <- gsub('\\.rda',paste('_hpc',hpc.select,'.rda',sep=''),file)
}
#
# re-root simulated trees with rtt
#
options(show.error.messages = FALSE)
readAttempt <- try(suppressWarnings(load(gsub('.rda','_01rerooted.rda',file))))
options(show.error.messages = TRUE)
if( 0 & inherits(readAttempt, "try-error") )
{
options(warn=2)
strs_rtt <- vector('list', length(strs))
for(i in submitted.info[, IDX])
{
cat('\n',i)
#i <- 628 ; i<- 241; i<- 571
ph <- strs[[i]]
tmp <- data.table(TAXA=ph$tip.label)
set(tmp, NULL, 'T_SEQ', tmp[, as.numeric(regmatches(TAXA, regexpr('[0-9]*\\.[0-9]+$|[0-9]+$', TAXA))) ])
#phr <- rtt(ph, tmp[, as.numeric(T_SEQ)])
phr <- rtt(ph, tmp[, T_SEQ], ncpu=1) #this may drop a tip!
strs_rtt[[i]] <- ladderize(phr)
}
names(strs_rtt) <- names(strs)
options(warn=0)
# save intermediate
save(strs, strs_rtt, strs_lsd, ttrs, trungps, tinfo, tbrl, tfiles, submitted.info, file=gsub('.rda','_01rerooted.rda',file))
} #
options(show.error.messages = FALSE)
readAttempt <- try(suppressWarnings(load(gsub('.rda','_03RF.rda',file))))
options(show.error.messages = TRUE)
if( inherits(readAttempt, "try-error") )
{
#
# long branch attraction
#
lba <- submitted.info[, {
#IDX<-638; SUB_IDX_T<- 1
cat(IDX,'\n')
ph <- strs_rtt[[IDX]]
tmp <- data.table(DEPTH=node.depth.edgelength(ph)[seq_len(Ntip(ph))], TAXA=ph$tip.label)
tmp <- merge(tmp, unique(subset(tinfo, IDX_T==SUB_IDX_T, c(TAXA,DEPTH_T))), by='TAXA')
tmp
}, by=c('MODEL','SC','TEAM','GAPS','GENE','IDX')]
#
# compute on true trees the proportion if either transmitter or among recipients
#
tmp <- treedist.closest.ind.obs(tinfo, gd.thresh=0.01)
setnames(tmp, c('TPAIR_PHCL_T','NTPAIR_PHCL_T'), c('TPAIR_PHCL_T_1','NTPAIR_PHCL_T_1'))
submitted.info <- merge(submitted.info, tmp, by='SUB_IDX_T', all.x=1)
tinfo.pairs <- treedist.closest.ind.obs(tinfo, gd.thresh=Inf, rtn.pairs=TRUE)
setnames(tinfo.pairs, 'TRUE_PAIR','TRUE_PAIR_Inf')
#
# compute closest individual on simulated trees and determine proportion if either transmitter or among recipients
#
tmp <- treedist.closest.ind.reconstructed(submitted.info, tinfo, strs, gd.thresh=0.01)
setnames(tmp, c('TPAIR_PHCL', 'NTPAIR_PHCL'), c('TPAIR_PHCL_1', 'NTPAIR_PHCL_1'))
submitted.info <- merge(submitted.info, tmp, by='IDX', all.x=1)
tmp <- treedist.closest.ind.reconstructed.oftruepairs(submitted.info, tinfo.pairs, strs)
submitted.info <- merge(submitted.info, tmp, by='IDX', all.x=1)
#
# compute Robinson Fould of complete tree
#
tmp <- treedist.robinsonfould.wrapper(submitted.info, ttrs, strs)
submitted.info <- merge(submitted.info, tmp, by='IDX')
# compute Robinson Fould of clusters, then take sum
tmp <- treedist.robinsonfouldclusters.wrapper(submitted.info, ttrs, strs, tinfo)
sclu.info <- merge(subset(submitted.info, select=c("IDX","SC","FILE","TEAM","MODEL","SEQCOV","ACUTE","GAPS","ART","EXT","BEST","OTHER","GENE","TAXAN","ROOTED","BRL","SUB_IDX_T","TIME_IDX_T","TAXAN_T")), tmp, by='IDX')
# save intermediate
save(strs, strs_rtt, strs_lsd, ttrs, trungps, tinfo, tbrl, tfiles, submitted.info, sclu.info, lba, file=gsub('.rda','_03RF.rda',file))
}
options(show.error.messages = FALSE)
readAttempt <- try(suppressWarnings(load(gsub('.rda','_04PD.rda',file))))
options(show.error.messages = TRUE)
if( inherits(readAttempt, "try-error") )
{
#
# path distance of complete trees
#
cat('\nPath distances on rooted trees')
tmp <- treedist.pathdifference.wrapper(submitted.info, ttrs, strs_rtt, use.weight=FALSE)
tmp[, TAXA_NJ:=NULL]
submitted.info <- merge(submitted.info, tmp, by='IDX', all.x=1)
# path distance of clusters
cat('\nPath distances on clusters')
tmp <- subset(submitted.info, MODEL=='R')
tmp <- treedist.pathdifference.clusters.wrapper(tmp, ttrs, strs_rtt, tinfo, use.weight=FALSE)
tmp[, TAXA_NC:=NULL]
sclu.info <- merge(sclu.info, tmp, by=c('IDX','IDCLU'), all.x=1)
# save intermediate
save(strs, strs_rtt, strs_lsd, ttrs, trungps, tinfo, tbrl, tfiles, submitted.info, sclu.info, lba, file=gsub('.rda','_04PD.rda',file))
}
options(show.error.messages = FALSE)
readAttempt <- try(suppressWarnings(load(gsub('.rda','_05QD.rda',file))))
options(show.error.messages = TRUE)
if( inherits(readAttempt, "try-error") )
{
#
# quartet distances of complete trees
#
cat('\nQuartett distances on rooted trees')
tmp <- treedist.quartetdifference.wrapper(submitted.info, ttrs, strs_rtt)
tmp[, TAXA_NJ:=NULL]
submitted.info <- merge(submitted.info, tmp, by='IDX', all.x=1)
# quartet distance of clusters
cat('\nQuartett distances on clusters')
tmp <- treedist.quartetdifference.clusters.wrapper(submitted.info, ttrs, strs_rtt, tinfo)
tmp[, TAXA_NC:=NULL]
sclu.info <- merge(sclu.info, tmp, by=c('IDX','IDCLU'), all.x=1)
save(strs, strs_rtt, strs_lsd, ttrs, trungps, tinfo, tbrl, tfiles, submitted.info, sclu.info, lba, file=gsub('.rda','_05QD.rda',file))
}
options(show.error.messages = FALSE)
readAttempt <- try(suppressWarnings(load(gsub('.rda','_06PDLSD.rda',file))))
options(show.error.messages = TRUE)
if( inherits(readAttempt, "try-error") )
{
#
# path distance of complete LSD trees
#
cat('\nPath distances on LSD trees')
tmp <- subset(submitted.info, WITH_LSD=='Y')
tmp <- treedist.pathdifference.wrapper(tmp, ttrs, strs_lsd, use.brl=FALSE, use.weight=TRUE)
setnames(tmp, c('PD','NPD','NPDSQ'), c('PD_LSD','NPD_LSD','NPDSQ_LSD'))
tmp[, TAXA_NJ:=NULL]
submitted.info <- merge(submitted.info, tmp, by='IDX', all.x=1)
# path distance of LSD clusters
cat('\nPath distances on LSD clusters')
tmp <- subset(submitted.info, WITH_LSD=='Y' & MODEL=='R')
tmp <- treedist.pathdifference.clusters.wrapper(tmp, ttrs, strs_lsd, tinfo, use.brl=FALSE, use.weight=TRUE)
setnames(tmp, c('PD','NPD','NPDSQ'), c('PD_LSD','NPD_LSD','NPDSQ_LSD'))
tmp[, TAXA_NC:=NULL]
sclu.info <- merge(sclu.info, tmp, by=c('IDX','IDCLU'), all.x=1)
# save intermediate
save(strs, strs_rtt, strs_lsd, ttrs, trungps, tinfo, tbrl, tfiles, submitted.info, sclu.info, lba, file=gsub('.rda','_06PDLSD.rda',file))
}
options(show.error.messages = FALSE)
readAttempt <- try(suppressWarnings(load(gsub('.rda','_07MSELSD.rda',file))))
options(show.error.messages = TRUE)
if( inherits(readAttempt, "try-error") )
{
# this is mem intensive, so now moved to cluster
tbrl <- lapply(seq_len(nrow(tfiles)), function(i)
{
ph <- ttrs[[tfiles[i, IDX_T]]]
tmp <- cophenetic.phylo(ph)
tmp <- as.data.table(melt(tmp))
setnames(tmp, c('Var1','Var2','value'),c('TAXA1','TAXA2','PD'))
tmp <- subset(tmp, TAXA1!=TAXA2)
tmp[, IDX_T:= tfiles[i, IDX_T]]
tmp[, SC:= tfiles[i, SC]]
tmp[, BRL_T:= tfiles[i, BRL_T]]
tmp[, TAXAN_T:= tfiles[i, TAXAN_T]]
tmp
})
tbrl <- do.call('rbind',tbrl)
# MSE between true time distances and reconstructed patristic distances in LSD tree
cat('\nMSE of edges on LSD trees')
tmp <- subset(submitted.info, WITH_LSD=='Y')
tmp <- treedist.MSE.wrapper(tmp, strs_lsd, tbrl, tinfo, use.brl=FALSE)
setnames(tmp, c('MSE','MAE','MSE_TP','MAE_TP'), c('MSE_LSD','MAE_LSD','MSE_TP_LSD','MAE_TP_LSD'))
tmp[, TAXA_NJ:=NULL]
submitted.info <- merge(submitted.info, tmp, by='IDX', all.x=1)
cat('\nMSE of edges on LSD clusters')
tmp <- subset(submitted.info, WITH_LSD=='Y' & MODEL=='R')
tmp <- treedist.MSE.clusters.wrapper(tmp, strs_lsd, tbrl, tinfo, use.brl=FALSE)
setnames(tmp, c('MSE','MAE','MSE_TP','MAE_TP'), c('MSE_LSD','MAE_LSD','MSE_TP_LSD','MAE_TP_LSD'))
tmp[, TAXA_NC:=NULL]
sclu.info <- merge(sclu.info, tmp, by=c('IDX','IDCLU'), all.x=1)
# save intermediate
save(strs, strs_rtt, strs_lsd, ttrs, trungps, tinfo, tbrl, tfiles, submitted.info, sclu.info, lba, file=gsub('.rda','_07MSELSD.rda',file))
save(strs, strs_rtt, strs_lsd, ttrs, trungps, tinfo, tfiles, submitted.info, sclu.info, lba, file=gsub('.rda','_07MSELSD_noTBRL.rda',file))
gc()
}
options(show.error.messages = FALSE)
readAttempt <- try(suppressWarnings(load(gsub('.rda','_08MSEGD.rda',file))))
options(show.error.messages = TRUE)
if( inherits(readAttempt, "try-error") )
{
# MSE between true time distances and reconstructed patristic distances in LSD tree
cat('\nMSE of edges on SUB trees')
tmp <- subset(submitted.info, MODEL=='R')
tmp <- treedist.MSE.wrapper(tmp, strs_rtt, tbrl, tinfo, use.brl=TRUE)
setnames(tmp, c('MSE','MAE','MSE_TP','MAE_TP'), c('MSE_GD','MAE_GD','MSE_TP_GD','MAE_TP_GD'))
tmp[, TAXA_NJ:=NULL]
tmp[, EDGE_NJ:=NULL]
submitted.info <- merge(submitted.info, tmp, by='IDX', all.x=1)
cat('\nMSE of edges on SUB clusters')
tmp <- subset(submitted.info, MODEL=='R')
tmp <- treedist.MSE.clusters.wrapper(tmp, strs_rtt, tbrl, tinfo, use.brl=TRUE)
setnames(tmp, c('MSE','MAE','MSE_TP','MAE_TP'), c('MSE_GD','MAE_GD','MSE_TP_GD','MAE_TP_GD'))
tmp[, TAXA_NC:=NULL]
tmp[, EDGE_NC:=NULL]
sclu.info <- merge(sclu.info, tmp, by=c('IDX','IDCLU'), all.x=1)
# save intermediate
save(strs, strs_rtt, strs_lsd, ttrs, trungps, tinfo, tfiles, submitted.info, sclu.info, lba, file=gsub('.rda','_08MSEGD.rda',file))
}
options(show.error.messages = FALSE)
readAttempt <- try(suppressWarnings(load(gsub('.rda','_09SBRL.rda',file))))
options(show.error.messages = TRUE)
if( inherits(readAttempt, "try-error") )
{
#
# sum of branch lengths per tree
#
tmp <- unique(subset(tinfo, select=IDX_T))
tmp <- tmp[, {
ph <- ttrs[[IDX_T]]
list(SUM_BRANCHES_T=sum(ph$edge.length))
}, by='IDX_T']
setnames(tmp, 'IDX_T', 'SUB_IDX_T')
submitted.info <- merge(submitted.info, tmp, by='SUB_IDX_T')
tmp <- submitted.info[, {
#IDX<-638; SUB_IDX_T<- 1
#cat(IDX,'\n')
ph <- strs[[IDX]]
list(SUM_BRANCHES=sum(ph$edge.length))
}, by=c('IDX')]
submitted.info <- merge(submitted.info, tmp, by='IDX')
# save intermediate
save(strs, strs_rtt, strs_lsd, ttrs, trungps, tinfo, tfiles, submitted.info, sclu.info, lba, file=gsub('.rda','_09SBRL.rda',file))
}
#
# ADD other summaries
#
#load( file.path(outdir, 'submitted_160704_KC.rda') )
#sclu.info.kc <- copy(sclu.info)
#load( file.path(outdir, 'submitted_160627_QDPD.rda') )
#sclu.info <- merge(sclu.info, subset(sclu.info.kc, select=c(IDX, TEAM, GENE, BRL, IDCLU, KC)), by=c('IDX','TEAM','GENE','BRL','IDCLU'))
#save(strs, strs_rtt, ttrs, tinfo, submitted.info, sclu.info, lba, file=file.path(outdir,'submitted_160627_QDPDKC.rda'))
}
#
#
#
treecomparison.submissions.151101<- function()
{
require(data.table)
require(ape)
require(phangorn)
#
# get true trees
#
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim_internal/freeze_July15'
tfiles <- list.files(indir, pattern='newick$', full.names=TRUE)
tfiles <- data.table( FILE_T=tfiles[ grepl('SUBSTTREE', tfiles) | grepl('Vill_99', tfiles) | grepl('Vill.*DATEDTREE', tfiles) ] )
tfiles[, SC:= toupper(gsub('_SUBSTTREE|_DATEDTREE','',gsub('.newick','',basename(FILE_T))))]
tmp <- rbind( subset(tfiles, SC=='VILL_99_APR15'), subset(tfiles, SC=='VILL_99_APR15'), subset(tfiles, SC=='VILL_99_APR15') )
set(tmp, NULL, 'SC', c('150701_VILL_SCENARIO-C','150701_VILL_SCENARIO-D','150701_VILL_SCENARIO-E'))
tfiles <- rbind(tfiles, tmp)
ttrs <- lapply(tfiles[, FILE_T], function(x) read.tree(file=x) )
names(ttrs) <- tfiles[, SC]
tfiles[, IDX_T:=seq_along(ttrs)]
tfiles[, TAXAN_T:= sapply(ttrs, Ntip)]
# info on true trees
tinfo <- merge(tfiles, do.call('rbind',lapply(seq_along(ttrs), function(i) data.table(TAXA=ttrs[[i]]$tip.label, IDX_T=i))), by='IDX_T')
tinfo[, IDPOP:=NA_character_]
tmp <- tinfo[, which(grepl('REGIONAL',SC))]
set(tinfo, tmp, 'IDPOP', tinfo[tmp,regmatches(TAXA, regexpr('IDPOP_[0-9]+',TAXA))])
tmp <- tinfo[, which(!grepl('REGIONAL',SC))]
set(tinfo, tmp, 'IDPOP', tinfo[tmp, regmatches(TAXA, regexpr('HOUSE[0-9]+-[0-9]+|House[0-9]+-[0-9]+',TAXA))])
stopifnot(subset(tinfo, grepl('VILL',SC))[, length(which(substring(TAXA,1,10)!=substring(IDPOP,1,10)))]==0)
stopifnot( tinfo[, length(which(is.na(IDPOP)))==0] )
set(tinfo, NULL, 'IDPOP', tinfo[,toupper(IDPOP)])
set(tinfo, NULL, 'TAXA', tinfo[,toupper(TAXA)])
# read cluster membership from DATEDCLUTREES
tmp <- list.files(indir, pattern='DATEDCLUTREES', full.names=TRUE)
tmp <- data.table( FILE_CLU_T= tmp, SC= toupper(gsub('_DATEDCLUTREES','',gsub('.newick','',basename(tmp)))))
tfiles <- merge(tfiles, tmp, by='SC', all=1)
tmp <- subset(tfiles, !is.na(FILE_CLU_T))[, {
z <- read.tree(FILE_CLU_T)
do.call('rbind',lapply(seq_along(z), function(i) data.table(IDCLU=i, TAXA=z[[i]]$tip.label)))
}, by='SC']
tinfo <- merge(tinfo, tmp, by=c('SC','TAXA'), all=1)
tmp <- subset(tinfo, !is.na(IDCLU))[, list(CLU_N= length(IDPOP)), by=c('SC','IDCLU')]
tinfo <- merge(tinfo, tmp, by=c('SC','IDCLU'), all=1)
# read sequences and determine %gappiness
tmp <- list.files(indir, pattern='fa$|fasta$', full.names=TRUE)
tmp <- data.table( FILE_SEQ_T= tmp, SC= toupper(gsub('_SIMULATED','',gsub('.fa','',basename(tmp)))))
z <- subset(tmp, SC=='VILL_99_APR15')
set(z, NULL, 'SC', '150701_VILL_SCENARIO-C')
tmp <- rbind( tmp, z )
tfiles <- merge(tfiles, tmp, by='SC', all=1)
tmp <- subset(tfiles, !is.na(FILE_SEQ_T))[, {
z <- read.dna(FILE_SEQ_T, format='fasta')
ans <- sapply(seq_len(nrow(z)), function(i) base.freq(z[i,], all=1))
ans <- apply(ans[c('n','-','?'),], 2, sum)
list(TAXA=rownames(z), GPS=ans)
}, by='SC']
tinfo <- merge(tinfo, tmp, by=c('SC','TAXA'), all.x=1)
#
# get submitted trees
#
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/IQTree/IQTree201507'
infiles <- list.files(indir, pattern='treefile$', recursive=1, full.names=1)
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/IQTree/IQTree201510'
infiles <- c(infiles, list.files(indir, pattern='treefile$', recursive=1, full.names=1))
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/PhyML'
infiles <- c(infiles, list.files(indir, pattern='*tree*', recursive=1, full.names=1))
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/RAxML'
infiles <- c(infiles, list.files(indir, pattern='*RAxML_bestTree*', recursive=1, full.names=1))
infiles <- c(infiles, list.files(indir, pattern="best_tree.newick", recursive=1, full.names=1))
infiles <- data.table(FILE=infiles)
strs <- lapply(infiles[, FILE], function(x)
{
cat(x)
read.tree(file=x)
})
names(strs) <- infiles[, FILE]
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/MetaPIGA'
tmp <- list.files(indir, pattern='*result*', recursive=1, full.names=1)
tmp <- data.table(FILE=tmp)
tmp.trees <- lapply(tmp[, FILE], function(x)
{
cat(x)
read.nexus(file=x)
})
sapply(tmp.trees, length)
MetaPIGA.trees <- c(lapply(tmp.trees, '[[', 1), lapply(tmp.trees, '[[', 2), lapply(tmp.trees, '[[', 3), lapply(tmp.trees, '[[', 4))
names(MetaPIGA.trees) <- c(sapply(tmp.trees, function(x) paste(names(x)[1],'_use',sep='')), sapply(tmp.trees, function(x) names(x)[2]), sapply(tmp.trees, function(x) names(x)[3]), sapply(tmp.trees, function(x) names(x)[4]))
names(MetaPIGA.trees) <- gsub("'",'',names(MetaPIGA.trees), fixed=1)
strs <- c(strs, MetaPIGA.trees)
submitted.info <- data.table(FILE=names(strs))
#
#
#
submitted.info[, IDX:=seq_along(strs)]
submitted.info[, TEAM:=NA_character_]
set(submitted.info, submitted.info[, which(grepl('RAXML|RAxML',FILE))], 'TEAM', 'RAXML')
set(submitted.info, submitted.info[, which(grepl('IQTree',FILE))], 'TEAM', 'IQTree')
set(submitted.info, submitted.info[, which(grepl('MetaPIGA|Consensus pruning|Best individual of population',FILE))], 'TEAM', 'MetaPIGA')
set(submitted.info, submitted.info[, which(grepl('PhyML',FILE))], 'TEAM', 'PhyML')
stopifnot( submitted.info[, length(which(is.na(TEAM)))==0] )
#
# scenario
#
submitted.info[, SC:=NA_character_]
tmp <- submitted.info[, which(grepl('150701_Regional_TRAIN[0-9]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, regmatches(FILE, regexpr('150701_Regional_TRAIN[0-9]',FILE))])
tmp <- submitted.info[, which(grepl('150701_Vill_SCENARIO-[A-Z]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, regmatches(FILE, regexpr('150701_Vill_SCENARIO-[A-Z]',FILE))])
tmp <- submitted.info[, which(is.na(SC) & grepl('TRAIN[0-9]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, paste('150701_Regional_',regmatches(FILE, regexpr('TRAIN[0-9]',FILE)),sep='')])
tmp <- submitted.info[, which(is.na(SC) & grepl('scenario[A-Z]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, paste('150701_Vill_',regmatches(FILE, regexpr('scenario[A-Z]',FILE)),sep='')])
tmp <- submitted.info[, which(is.na(SC) & grepl('150701_regional_train[0-9]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, regmatches(FILE, regexpr('150701_regional_train[0-9]',FILE))])
tmp <- submitted.info[, which(is.na(SC) & grepl('150701_vill_scenario-[A-Z]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, regmatches(FILE, regexpr('150701_vill_scenario-[A-Z]',FILE))])
tmp <- submitted.info[, which(is.na(SC) & grepl('Vill_99_Apr15', FILE))]
set(submitted.info, tmp, 'SC', 'Vill_99_Apr15')
set(submitted.info, NULL, 'SC', submitted.info[, toupper(SC)])
tmp <- submitted.info[, which(grepl('150701_VILL_SCENARIO[A-Z]', SC))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, gsub('150701_VILL_SCENARIO','150701_VILL_SCENARIO-',SC)])
stopifnot( submitted.info[, length(which(is.na(SC)))==0] )
#
# set covariates of scenarios
#
tmp <- data.table( SC= c("150701_REGIONAL_TRAIN1","150701_REGIONAL_TRAIN2","150701_REGIONAL_TRAIN3","150701_REGIONAL_TRAIN4" ,"150701_REGIONAL_TRAIN5", "150701_VILL_SCENARIO-A", "150701_VILL_SCENARIO-B", "VILL_99_APR15","150701_VILL_SCENARIO-C", "150701_VILL_SCENARIO-D", "150701_VILL_SCENARIO-E"),
MODEL= c('R','R','R','R','R','V','V','V','V','V','V'),
SEQCOV= c(0.16, 0.16, 0.16, 0.16, 0.16, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6),
ACUTE= c('low', 'low', 'high', 'low', 'high', 'high', 'high', 'high', 'high', 'high', 'high'),
GAPS= c('none', 'low', 'low', 'high', 'high', 'low', 'high', 'none', 'none', 'low', 'high'),
ART= c('none', 'none', 'none', 'none', 'none', 'none', 'none', 'fast', 'fast', 'fast', 'fast'),
EXT= c('5pc', '5pc', '5pc', '5pc', '5pc', '~0pc', '~0pc', '~0pc', '~0pc', '~0pc', '~0pc')
)
submitted.info <- merge(submitted.info, tmp, by='SC')
#
# best tree for each scenario
#
submitted.info[, BEST:='N']
set(submitted.info, submitted.info[, which(grepl('RAxML', FILE) & grepl('best_tree', FILE))], 'BEST', 'Y')
# copied from ListOfBestTrees_IQTree150818.txt
# there are several best trees for some scenarios
tmp <- c( '150701_Vill_SCENARIO-A_IQTree150814_partition_12_3_07',
'150701_Vill_SCENARIO-A_IQTree150814_partition_12_3_04.',
'150701_Vill_SCENARIO-B_IQTree150814_partition_12_3_03.',
'Vill_99_Apr15_IQTree150814_partition_123.',
'150701_Vill_SCENARIO-D_IQTree150814_partition_12_3.',
'150701_Vill_SCENARIO-E_IQTree150814_partition_12_3.',
'150701_Vill_SCENARIO-A_IQTree150814_pol_partition_12_3.',
'150701_Vill_SCENARIO-B_IQTree150814_pol_partition_12_3_05.',
'Vill_99_Apr15_IQTree150814_pol_partition_12_3_09.',
'Vill_99_Apr15_IQTree150814_pol_partition_12_3_10.',
'150701_Vill_SCENARIO-D_IQTree150814_pol_partition_12_3_05.',
'150701_Vill_SCENARIO-D_IQTree150814_pol_partition_12_3_06.',
'150701_Vill_SCENARIO-D_IQTree150814_pol_partition_12_3_09.',
'150701_Vill_SCENARIO-E_IQTree150814_pol_partition_12_3_06.',
'150701_Regional_TRAIN1_IQTree150818_partition_123_03.',
'150701_Regional_TRAIN1_IQTree150818_pol_partition_123_05.')
tmp <- sapply(tmp, function(x) submitted.info[, which((grepl('IQTree150814/', FILE, fixed=1) | grepl('IQTree150818/', FILE, fixed=1)) & grepl(x, FILE, fixed=1))] )
set(submitted.info, tmp, 'BEST', 'Y')
tmp <- c('150701_Regional_TRAIN2_IQTree151019_partition_123_10',
'150701_Regional_TRAIN3_IQTree151019_partition_123_03',
'150701_Regional_TRAIN4_IQTree151019_partition_123_10',
'150701_Regional_TRAIN5_IQTree151019_partition_123_01',
'150701_Regional_TRAIN2_IQTree151019_pol_partition_123_08',
'150701_Regional_TRAIN3_IQTree151019_pol_partition_123_08',
'150701_Regional_TRAIN4_IQTree151019_pol_partition_123_05',
'150701_Regional_TRAIN5_IQTree151019_pol_partition_123_10')
tmp <- sapply(tmp, function(x) submitted.info[, which((grepl('IQTree151019', FILE, fixed=1)) & grepl(x, FILE, fixed=1))] )
set(submitted.info, tmp, 'BEST', 'Y')
# PhyML no replicates: all files best
set(submitted.info, submitted.info[, which(TEAM=='PhyML')], 'BEST', 'Y')
#
# set OTHER (ie old or some preliminary/unknown tree)
#
submitted.info[, OTHER:='N']
# MetaPIGA tree to be used is first in nexus list (which was tagged with best above)
set(submitted.info, submitted.info[, which(TEAM=='MetaPIGA' & !grepl('use', FILE))], 'OTHER', 'Y')
# IQTree did several uploads, use only most recent in main analysis
set(submitted.info, submitted.info[, which(grepl('150701_Regional_TRAIN1_IQTree150814', FILE))], 'OTHER', 'Y')
set(submitted.info, submitted.info[, which(TEAM=='IQTree' & MODEL=='R' & !grepl('TRAIN1', SC) & grepl('201507/',FILE,fixed=1))], 'OTHER', 'Y')
#
# set which gene used to construct tree (either pol or concatenated gag+pol+env)
#
submitted.info[, GENE:=NA_character_]
set(submitted.info, submitted.info[, which(TEAM=='RAXML' & grepl('full', FILE))], 'GENE', 'GAG+POL+ENV')
set(submitted.info, submitted.info[, which(TEAM=='RAXML' & grepl('pol', FILE))], 'GENE', 'POL')
stopifnot(nrow(subset(submitted.info, TEAM=='RAXML' & is.na(GENE)))==0)
set(submitted.info, submitted.info[, which(TEAM=='PhyML')], 'GENE', 'POL')
set(submitted.info, submitted.info[, which(TEAM=='MetaPIGA')], 'GENE', 'GAG+POL+ENV')
set(submitted.info, submitted.info[, which(TEAM=='IQTree' & grepl('[0-9]_partition', FILE))], 'GENE', 'GAG+POL+ENV')
set(submitted.info, submitted.info[, which(TEAM=='IQTree' & grepl('[0-9]_pol_partition', FILE))], 'GENE', 'POL')
stopifnot(nrow(subset(submitted.info, TEAM=='IQTree' & is.na(GENE)))==0)
#
# number taxa in tree
#
setkey(submitted.info, IDX)
submitted.info[, TAXAN:= sapply(strs, Ntip)]
#
# are trees rooted?
#
setkey(submitted.info, IDX)
submitted.info[, ROOTED:=factor(sapply(strs, is.rooted),levels=c(TRUE,FALSE),labels=c('Y','N'))]
#
# add index of true tree
#
require(phangorn)
submitted.info <- merge(submitted.info, subset(tfiles, select=c('SC','IDX_T','TAXAN_T')), by='SC')
stopifnot(nrow(subset(submitted.info, TAXAN>TAXAN_T))==0)
#
# fix taxa names that teams have changed
#
tmp <- subset(submitted.info, TEAM=='IQTree' & MODEL=='R')[, IDX]
for(i in tmp)
{
strs[[i]]$tip.label <- sapply(strsplit(strs[[i]]$tip.label,'_'), function(x) paste(x[1],'_',x[2],'|',x[3],'|',x[4],'_',x[5],'|',x[6],sep='') )
}
for(i in seq_along(strs))
{
strs[[i]]$tip.label <- toupper(strs[[i]]$tip.label)
}
for(i in seq_along(ttrs))
{
ttrs[[i]]$tip.label <- toupper(ttrs[[i]]$tip.label)
}
###
tmp <- subset(submitted.info, TEAM=='PhyML' & MODEL=='R')[, IDX]
for(i in tmp)
{
z <- data.table(IDX=seq_along(strs[[i]]$tip.label), IDPOP=regmatches(strs[[i]]$tip.label, regexpr('IDPOP_[0-9]+',strs[[i]]$tip.label)), SC=subset(submitted.info, IDX==i)[,SC])
z <- merge(subset(tinfo, select=c(IDPOP,SC,TAXA)), z, by=c('IDPOP','SC'))
setkey(z, IDX)
strs[[i]]$tip.label <- z[, TAXA]
}
tmp <- subset(submitted.info, TEAM=='PhyML' & MODEL=='V')[, IDX]
for(i in tmp)
{
z <- data.table(IDX=seq_along(strs[[i]]$tip.label), IDPOP=regmatches(strs[[i]]$tip.label, regexpr('HOUSE[0-9]+-[0-9]+|House[0-9]+-[0-9]+',strs[[i]]$tip.label)), SC=subset(submitted.info, IDX==i)[,SC])
z <- merge(subset(tinfo, select=c(IDPOP,SC,TAXA)), z, by=c('IDPOP','SC'))
stopifnot(nrow(z)==length(strs[[i]]$tip.label))
setkey(z, IDX)
strs[[i]]$tip.label <- z[, TAXA]
}
#
# compute Robinson Fould of complete tree
#
tmp <- treedist.robinsonfould.wrapper(submitted.info, ttrs, strs)
submitted.info <- merge(submitted.info, tmp, by='IDX')
# compute Robinson Fould of clusters, then take sum
tmp <- treedist.robinsonfouldclusters.wrapper(submitted.info, ttrs, strs, tinfo)
sclu.info <- merge(submitted.info, tmp, by='IDX')
#
outfile <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation/submitted_151101.rda'
save(strs, ttrs, tinfo, submitted.info, sclu.info, file=outfile)
}
treecomparison.submissions.151016<- function()
{
require(data.table)
require(ape)
require(phangorn)
#
# get true trees
#
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim_internal/freeze_July15'
tfiles <- list.files(indir, pattern='newick$', full.names=TRUE)
tfiles <- data.table( FILE_T=tfiles[ grepl('SUBSTTREE', tfiles) | grepl('Vill_99', tfiles) | grepl('Vill.*DATEDTREE', tfiles) ] )
tfiles[, SC:= toupper(gsub('_SUBSTTREE|_DATEDTREE','',gsub('.newick','',basename(FILE_T))))]
tmp <- rbind( subset(tfiles, SC=='VILL_99_APR15'), subset(tfiles, SC=='VILL_99_APR15'), subset(tfiles, SC=='VILL_99_APR15') )
set(tmp, NULL, 'SC', c('150701_VILL_SCENARIO-C','150701_VILL_SCENARIO-D','150701_VILL_SCENARIO-E'))
tfiles <- rbind(tfiles, tmp)
ttrs <- lapply(tfiles[, FILE_T], function(x) read.tree(file=x) )
names(ttrs) <- tfiles[, SC]
tfiles[, IDX_T:=seq_along(ttrs)]
tfiles[, TAXAN_T:= sapply(ttrs, Ntip)]
# info on true trees
tinfo <- merge(tfiles, do.call('rbind',lapply(seq_along(ttrs), function(i) data.table(TAXA=ttrs[[i]]$tip.label, IDX_T=i))), by='IDX_T')
tinfo[, IDPOP:=NA_character_]
tmp <- tinfo[, which(grepl('REGIONAL',SC))]
set(tinfo, tmp, 'IDPOP', tinfo[tmp,regmatches(TAXA, regexpr('IDPOP_[0-9]+',TAXA))])
tmp <- tinfo[, which(!grepl('REGIONAL',SC))]
set(tinfo, tmp, 'IDPOP', tinfo[tmp, regmatches(TAXA, regexpr('HOUSE[0-9]+-[0-9]+|House[0-9]+-[0-9]+',TAXA))])
stopifnot(subset(tinfo, grepl('VILL',SC))[, length(which(substring(TAXA,1,10)!=substring(IDPOP,1,10)))]==0)
stopifnot( tinfo[, length(which(is.na(IDPOP)))==0] )
set(tinfo, NULL, 'IDPOP', tinfo[,toupper(IDPOP)])
set(tinfo, NULL, 'TAXA', tinfo[,toupper(TAXA)])
# read cluster membership from DATEDCLUTREES
tmp <- list.files(indir, pattern='DATEDCLUTREES', full.names=TRUE)
tmp <- data.table( FILE_CLU_T= tmp, SC= toupper(gsub('_DATEDCLUTREES','',gsub('.newick','',basename(tmp)))))
tfiles <- merge(tfiles, tmp, by='SC', all=1)
tmp <- subset(tfiles, !is.na(FILE_CLU_T))[, {
z <- read.tree(FILE_CLU_T)
do.call('rbind',lapply(seq_along(z), function(i) data.table(IDCLU=i, TAXA=z[[i]]$tip.label)))
}, by='SC']
tinfo <- merge(tinfo, tmp, by=c('SC','TAXA'), all=1)
tmp <- subset(tinfo, !is.na(IDCLU))[, list(CLU_N= length(IDPOP)), by=c('SC','IDCLU')]
tinfo <- merge(tinfo, tmp, by=c('SC','IDCLU'), all=1)
# read sequences and determine %gappiness
tmp <- list.files(indir, pattern='fa$|fasta$', full.names=TRUE)
tmp <- data.table( FILE_SEQ_T= tmp, SC= toupper(gsub('_SIMULATED','',gsub('.fa','',basename(tmp)))))
z <- subset(tmp, SC=='VILL_99_APR15')
set(z, NULL, 'SC', '150701_VILL_SCENARIO-C')
tmp <- rbind( tmp, z )
tfiles <- merge(tfiles, tmp, by='SC', all=1)
tmp <- subset(tfiles, !is.na(FILE_SEQ_T))[, {
z <- read.dna(FILE_SEQ_T, format='fasta')
ans <- sapply(seq_len(nrow(z)), function(i) base.freq(z[i,], all=1))
ans <- apply(ans[c('n','-','?'),], 2, sum)
list(TAXA=rownames(z), GPS=ans)
}, by='SC']
tinfo <- merge(tinfo, tmp, by=c('SC','TAXA'), all.x=1)
#
# get submitted trees
#
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/IQTree/IQTree201507'
infiles <- list.files(indir, pattern='treefile$', recursive=1, full.names=1)
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/PhyML'
infiles <- c(infiles, list.files(indir, pattern='*tree*', recursive=1, full.names=1))
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/RAxML'
infiles <- c(infiles, list.files(indir, pattern='*RAxML_bestTree*', recursive=1, full.names=1))
infiles <- c(infiles, list.files(indir, pattern="best_tree.newick", recursive=1, full.names=1))
infiles <- data.table(FILE=infiles)
strs <- lapply(infiles[, FILE], function(x)
{
cat(x)
read.tree(file=x)
})
names(strs) <- infiles[, FILE]
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/MetaPIGA'
tmp <- list.files(indir, pattern='*result*', recursive=1, full.names=1)
tmp <- data.table(FILE=tmp)
tmp.trees <- lapply(tmp[, FILE], function(x)
{
cat(x)
read.nexus(file=x)
})
sapply(tmp.trees, length)
MetaPIGA.trees <- c(lapply(tmp.trees, '[[', 1), lapply(tmp.trees, '[[', 2), lapply(tmp.trees, '[[', 3), lapply(tmp.trees, '[[', 4))
names(MetaPIGA.trees) <- c(sapply(tmp.trees, function(x) paste(names(x)[1],'_use',sep='')), sapply(tmp.trees, function(x) names(x)[2]), sapply(tmp.trees, function(x) names(x)[3]), sapply(tmp.trees, function(x) names(x)[4]))
names(MetaPIGA.trees) <- gsub("'",'',names(MetaPIGA.trees), fixed=1)
strs <- c(strs, MetaPIGA.trees)
submitted.info <- data.table(FILE=names(strs))
#
#
#
submitted.info[, IDX:=seq_along(strs)]
submitted.info[, TEAM:=NA_character_]
set(submitted.info, submitted.info[, which(grepl('RAXML|RAxML',FILE))], 'TEAM', 'RAXML')
set(submitted.info, submitted.info[, which(grepl('IQTree',FILE))], 'TEAM', 'IQTree')
set(submitted.info, submitted.info[, which(grepl('MetaPIGA|Consensus pruning|Best individual of population',FILE))], 'TEAM', 'MetaPIGA')
set(submitted.info, submitted.info[, which(grepl('PhyML',FILE))], 'TEAM', 'PhyML')
stopifnot( submitted.info[, length(which(is.na(TEAM)))==0] )
#
# scenario
#
submitted.info[, SC:=NA_character_]
tmp <- submitted.info[, which(grepl('150701_Regional_TRAIN[0-9]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, regmatches(FILE, regexpr('150701_Regional_TRAIN[0-9]',FILE))])
tmp <- submitted.info[, which(grepl('150701_Vill_SCENARIO-[A-Z]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, regmatches(FILE, regexpr('150701_Vill_SCENARIO-[A-Z]',FILE))])
tmp <- submitted.info[, which(is.na(SC) & grepl('TRAIN[0-9]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, paste('150701_Regional_',regmatches(FILE, regexpr('TRAIN[0-9]',FILE)),sep='')])
tmp <- submitted.info[, which(is.na(SC) & grepl('scenario[A-Z]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, paste('150701_Vill_',regmatches(FILE, regexpr('scenario[A-Z]',FILE)),sep='')])
tmp <- submitted.info[, which(is.na(SC) & grepl('150701_regional_train[0-9]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, regmatches(FILE, regexpr('150701_regional_train[0-9]',FILE))])
tmp <- submitted.info[, which(is.na(SC) & grepl('150701_vill_scenario-[A-Z]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, regmatches(FILE, regexpr('150701_vill_scenario-[A-Z]',FILE))])
tmp <- submitted.info[, which(is.na(SC) & grepl('Vill_99_Apr15', FILE))]
set(submitted.info, tmp, 'SC', 'Vill_99_Apr15')
set(submitted.info, NULL, 'SC', submitted.info[, toupper(SC)])
tmp <- submitted.info[, which(grepl('150701_VILL_SCENARIO[A-Z]', SC))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, gsub('150701_VILL_SCENARIO','150701_VILL_SCENARIO-',SC)])
stopifnot( submitted.info[, length(which(is.na(SC)))] )
#
# set covariates of scenarios
#
tmp <- data.table( SC= c("150701_REGIONAL_TRAIN1","150701_REGIONAL_TRAIN2","150701_REGIONAL_TRAIN3","150701_REGIONAL_TRAIN4" ,"150701_REGIONAL_TRAIN5", "150701_VILL_SCENARIO-A", "150701_VILL_SCENARIO-B", "VILL_99_APR15","150701_VILL_SCENARIO-C", "150701_VILL_SCENARIO-D", "150701_VILL_SCENARIO-E"),
MODEL= c('R','R','R','R','R','V','V','V','V','V','V'),
SEQCOV= c(0.16, 0.16, 0.16, 0.16, 0.16, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6),
ACUTE= c('low', 'low', 'high', 'low', 'high', 'high', 'high', 'high', 'high', 'high', 'high'),
GAPS= c('none', 'low', 'low', 'high', 'high', 'low', 'high', 'none', 'none', 'low', 'high'),
ART= c('none', 'none', 'none', 'none', 'none', 'none', 'none', 'fast', 'fast', 'fast', 'fast'),
EXT= c('5pc', '5pc', '5pc', '5pc', '5pc', '~0pc', '~0pc', '~0pc', '~0pc', '~0pc', '~0pc')
)
submitted.info <- merge(submitted.info, tmp, by='SC')
#
# best tree for each scenario
#
submitted.info[, BEST:='N']
set(submitted.info, submitted.info[, which(grepl('RAxML', FILE) & grepl('best_tree', FILE))], 'BEST', 'Y')
# copied from ListOfBestTrees_IQTree150818.txt
# there are several best trees for some scenarios
tmp <- c( '150701_Vill_SCENARIO-A_IQTree150814_partition_12_3_07',
'150701_Vill_SCENARIO-A_IQTree150814_partition_12_3_04.',
'150701_Vill_SCENARIO-B_IQTree150814_partition_12_3_03.',
'Vill_99_Apr15_IQTree150814_partition_123.',
'150701_Vill_SCENARIO-D_IQTree150814_partition_12_3.',
'150701_Vill_SCENARIO-E_IQTree150814_partition_12_3.',
'150701_Vill_SCENARIO-A_IQTree150814_pol_partition_12_3.',
'150701_Vill_SCENARIO-B_IQTree150814_pol_partition_12_3_05.',
'Vill_99_Apr15_IQTree150814_pol_partition_12_3_09.',
'Vill_99_Apr15_IQTree150814_pol_partition_12_3_10.',
'150701_Vill_SCENARIO-D_IQTree150814_pol_partition_12_3_05.',
'150701_Vill_SCENARIO-D_IQTree150814_pol_partition_12_3_06.',
'150701_Vill_SCENARIO-D_IQTree150814_pol_partition_12_3_09.',
'150701_Vill_SCENARIO-E_IQTree150814_pol_partition_12_3_06.',
'150701_Regional_TRAIN1_IQTree150818_partition_123_03.',
'150701_Regional_TRAIN2_IQTree150814_partition_123_03.',
'150701_Regional_TRAIN3_IQTree150814_partition_123_01.',
'150701_Regional_TRAIN4_IQTree150814_partition_123_02.',
'150701_Regional_TRAIN5_IQTree150814_partition_123.',
'150701_Regional_TRAIN1_IQTree150818_pol_partition_123_05.',
'150701_Regional_TRAIN2_IQTree150814_pol_partition_123_10.',
'150701_Regional_TRAIN3_IQTree150814_pol_partition_123_05.',
'150701_Regional_TRAIN3_IQTree150814_pol_partition_123_06.',
'150701_Regional_TRAIN3_IQTree150814_pol_partition_123_08.',
'150701_Regional_TRAIN4_IQTree150814_pol_partition_123_10.',
'150701_Regional_TRAIN5_IQTree150814_pol_partition_123_05.')
tmp <- sapply(tmp, function(x) submitted.info[, which((grepl('IQTree150814/', FILE, fixed=1) | grepl('IQTree150818/', FILE, fixed=1)) & grepl(x, FILE, fixed=1))] )
set(submitted.info, tmp, 'BEST', 'Y')
# PhyML no replicates: all files best
set(submitted.info, submitted.info[, which(TEAM=='PhyML')], 'BEST', 'Y')
#
# set OTHER (ie old or some preliminary/unknown tree)
#
submitted.info[, OTHER:='N']
# MetaPIGA tree to be used is first in nexus list (which was tagged with best above)
set(submitted.info, submitted.info[, which(TEAM=='MetaPIGA' & !grepl('use', FILE))], 'OTHER', 'Y')
# IQTree did several uploads, use only most recent in main analysis
set(submitted.info, submitted.info[, which(grepl('150701_Regional_TRAIN1_IQTree150814', FILE))], 'OTHER', 'Y')
#
# set which gene used to construct tree (either pol or concatenated gag+pol+env)
#
submitted.info[, GENE:=NA_character_]
set(submitted.info, submitted.info[, which(TEAM=='RAXML' & grepl('full', FILE))], 'GENE', 'GAG+POL+ENV')
set(submitted.info, submitted.info[, which(TEAM=='RAXML' & grepl('pol', FILE))], 'GENE', 'POL')
stopifnot(nrow(subset(submitted.info, TEAM=='RAXML' & is.na(GENE)))==0)
set(submitted.info, submitted.info[, which(TEAM=='PhyML')], 'GENE', 'POL')
set(submitted.info, submitted.info[, which(TEAM=='MetaPIGA')], 'GENE', 'GAG+POL+ENV')
set(submitted.info, submitted.info[, which(TEAM=='IQTree' & grepl('[0-9]_partition', FILE))], 'GENE', 'GAG+POL+ENV')
set(submitted.info, submitted.info[, which(TEAM=='IQTree' & grepl('[0-9]_pol_partition', FILE))], 'GENE', 'POL')
stopifnot(nrow(subset(submitted.info, TEAM=='IQTree' & is.na(GENE)))==0)
#
# number taxa in tree
#
setkey(submitted.info, IDX)
submitted.info[, TAXAN:= sapply(strs, Ntip)]
#
# are trees rooted?
#
setkey(submitted.info, IDX)
submitted.info[, ROOTED:=factor(sapply(strs, is.rooted),levels=c(TRUE,FALSE),labels=c('Y','N'))]
#
# add index of true tree
#
require(phangorn)
submitted.info <- merge(submitted.info, subset(tfiles, select=c('SC','IDX_T','TAXAN_T')), by='SC')
stopifnot(nrow(subset(submitted.info, TAXAN>TAXAN_T))==0)
#
# fix taxa names that teams have changed
#
tmp <- subset(submitted.info, TEAM=='IQTree' & MODEL=='R')[, IDX]
for(i in tmp)
{
strs[[i]]$tip.label <- sapply(strsplit(strs[[i]]$tip.label,'_'), function(x) paste(x[1],'_',x[2],'|',x[3],'|',x[4],'_',x[5],'|',x[6],sep='') )
}
for(i in seq_along(strs))
{
strs[[i]]$tip.label <- toupper(strs[[i]]$tip.label)
}
for(i in seq_along(ttrs))
{
ttrs[[i]]$tip.label <- toupper(ttrs[[i]]$tip.label)
}
###
tmp <- subset(submitted.info, TEAM=='PhyML' & MODEL=='R')[, IDX]
for(i in tmp)
{
z <- data.table(IDX=seq_along(strs[[i]]$tip.label), IDPOP=regmatches(strs[[i]]$tip.label, regexpr('IDPOP_[0-9]+',strs[[i]]$tip.label)), SC=subset(submitted.info, IDX==i)[,SC])
z <- merge(subset(tinfo, select=c(IDPOP,SC,TAXA)), z, by=c('IDPOP','SC'))
setkey(z, IDX)
strs[[i]]$tip.label <- z[, TAXA]
}
tmp <- subset(submitted.info, TEAM=='PhyML' & MODEL=='V')[, IDX]
for(i in tmp)
{
z <- data.table(IDX=seq_along(strs[[i]]$tip.label), IDPOP=regmatches(strs[[i]]$tip.label, regexpr('HOUSE[0-9]+-[0-9]+|House[0-9]+-[0-9]+',strs[[i]]$tip.label)), SC=subset(submitted.info, IDX==i)[,SC])
z <- merge(subset(tinfo, select=c(IDPOP,SC,TAXA)), z, by=c('IDPOP','SC'))
stopifnot(nrow(z)==length(strs[[i]]$tip.label))
setkey(z, IDX)
strs[[i]]$tip.label <- z[, TAXA]
}
#
# compute Robinson Fould of complete tree
#
tmp <- treedist.robinsonfould.wrapper(submitted.info, ttrs, strs)
submitted.info <- merge(submitted.info, tmp, by='IDX')
#
# compute path differences on complete trees
#
setkey(submitted.info, IDX)
#tmp <- subset(submitted.info, IDX==463)[1,]
#IDX<- 1; IDX_T<- 1
#IDX<- 822; IDX_T<- 11
tmp <- submitted.info[, {
cat('\nAt IDX', IDX)
stree <- unroot(strs[[IDX]])
otree <- unroot(multi2di(ttrs[[IDX_T]]))
if(!is.binary.tree(stree))
{
cat('\nFound non-binary tree at IDX',IDX)
stree <- multi2di(stree)
}
#print(stree)
#print(otree)
z <- setdiff(otree$tip.label, stree$tip.label)
stopifnot( length(z)==abs(diff(c(Ntip(otree), Ntip(stree)))) )
if(length(z))
otree <- unroot(drop.tip(otree, z))
#normalize with choose(n,2)
tmp <- treedist.pathdifference(otree, stree, lambda=0)
list(PD=tmp['path'], NPD=tmp['path.std'])
}, by='IDX']
submitted.info <- merge(submitted.info, tmp, by='IDX')
# compute Robinson Fould of clusters, then take sum
tmp <- treedist.robinsonfouldclusters.wrapper(submitted.info, ttrs, strs, tinfo)
sclu.info <- merge(submitted.info, tmp, by='IDX')
#
#
#
outfile <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation/submitted_151023.rda'
save(strs, ttrs, tinfo, submitted.info, sclu.info, file=outfile)
}
##--------------------------------------------------------------------------------------------------------
## olli 03.12.15
##--------------------------------------------------------------------------------------------------------
treecomparison.submissions.update.160430<- function()
{
edir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation'
timetag <- '160430'
#
# collect results so far
#
file <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation/submitted_151203.rda'
load(file)
#
# to tinfo add actual transmitters
#
# check TRAIN1
load( '/Users/Oliver/Dropbox (SPH Imperial College)/PANGEAHIVsim_internal/freeze_July15/150701_Regional_TRAIN1_SIMULATED_INTERNAL.R' )
ch <- subset(tinfo, SC=='150701_REGIONAL_TRAIN1' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(DOB==DOB_CH)], ch[, all(GENDER==GENDER_CH)], ch[, all(TIME_SEQ==TIME_SEQ_CH)] )
subset(ch, TIME_SEQ!=TIME_SEQ_CH)
# OK :-) schedule adding IDPOP_T
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDREC,sep='')]]
tinfo.add <- subset(df.trms, select=c(IDPOP, IDTR))
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDTR,sep='')]]
tinfo.add <- merge(tinfo.add, subset(df.trms, select=c(IDPOP, IDREC)), by='IDPOP', all=1)
set(ch, NULL, 'IDPOP', ch[, paste('IDPOP_',IDPOP,sep='')])
tinfo.add <- merge(tinfo.add, subset(ch, select=IDPOP), by='IDPOP')
tinfo.add[, SC:='150701_REGIONAL_TRAIN1']
# check TRAIN2
load( '/Users/Oliver/Dropbox (SPH Imperial College)/PANGEAHIVsim_internal/freeze_July15/150701_Regional_TRAIN2_SIMULATED_INTERNAL.R' )
ch <- subset(tinfo, SC=='150701_REGIONAL_TRAIN2' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(DOB==DOB_CH)], ch[, all(GENDER==GENDER_CH)], ch[, all(TIME_SEQ==TIME_SEQ_CH)] )
subset(ch, TIME_SEQ!=TIME_SEQ_CH)
# OK :-) schedule adding IDPOP_T
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDREC,sep='')]]
tmp <- subset(df.trms, select=c(IDPOP, IDTR))
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDTR,sep='')]]
tmp <- merge(tmp, subset(df.trms, select=c(IDPOP, IDREC)), by='IDPOP', all=1)
set(ch, NULL, 'IDPOP', ch[, paste('IDPOP_',IDPOP,sep='')])
tmp <- merge(tmp, subset(ch, select=IDPOP), by='IDPOP')
tmp[, SC:='150701_REGIONAL_TRAIN2']
tinfo.add <- rbind(tinfo.add, tmp)
# check TRAIN4
ch <- subset(tinfo, SC=='150701_REGIONAL_TRAIN4' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(DOB==DOB_CH)], ch[, all(GENDER==GENDER_CH)], ch[, all(TIME_SEQ==TIME_SEQ_CH)] )
subset(ch, TIME_SEQ!=TIME_SEQ_CH)
tmp[, SC:='150701_REGIONAL_TRAIN4']
tinfo.add <- rbind(tinfo.add, tmp)
# check TRAIN3
load( '/Users/Oliver/Dropbox (SPH Imperial College)/PANGEAHIVsim_internal/freeze_July15/150701_Regional_TRAIN3_SIMULATED_INTERNAL.R' )
ch <- subset(tinfo, SC=='150701_REGIONAL_TRAIN3' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(DOB==DOB_CH)], ch[, all(GENDER==GENDER_CH)], ch[, all(TIME_SEQ==TIME_SEQ_CH)] )
subset(ch, TIME_SEQ!=TIME_SEQ_CH)
# OK :-) schedule adding IDPOP_T
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDREC,sep='')]]
tmp <- subset(df.trms, select=c(IDPOP, IDTR))
df.trms[, IDPOP:= df.trms[, paste('IDPOP_',IDTR,sep='')]]
tmp <- merge(tmp, subset(df.trms, select=c(IDPOP, IDREC)), by='IDPOP', all=1)
set(ch, NULL, 'IDPOP', ch[, paste('IDPOP_',IDPOP,sep='')])
tmp <- merge(tmp, subset(ch, select=IDPOP), by='IDPOP')
tmp[, SC:='150701_REGIONAL_TRAIN3']
tinfo.add <- rbind(tinfo.add, tmp)
# check TRAIN5
ch <- subset(tinfo, SC=='150701_REGIONAL_TRAIN5' & BRL_T=='time', TAXA)
ch[, IDPOP:= as.integer(gsub('IDPOP_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',1)))]
ch[, GENDER_CH:= sapply(strsplit(TAXA,'|',fixed=1),'[[',2)]
ch[, DOB_CH:= as.numeric(gsub('DOB_','',sapply(strsplit(TAXA,'|',fixed=1),'[[',3)))]
ch[, TIME_SEQ_CH:= as.numeric(sapply(strsplit(TAXA,'|',fixed=1),'[[',4))]
ch <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ)), ch, by='IDPOP')
stopifnot( ch[, all(DOB==DOB_CH)], ch[, all(GENDER==GENDER_CH)], ch[, all(TIME_SEQ==TIME_SEQ_CH)] )
subset(ch, TIME_SEQ!=TIME_SEQ_CH)
tmp[, SC:='150701_REGIONAL_TRAIN5']
tinfo.add <- rbind(tinfo.add, tmp)
# add transmitters for regional to tinfo
tinfo <- merge(tinfo, tinfo.add, by=c('IDPOP', 'SC'), all.x=1)
#
# compute closest individual on true trees
#
tmp <- unique(subset(tinfo, select=c(SC, BRL_T, IDX_T)))
tmp <- tmp[, {
print(IDX_T)
ph <- ttrs[[IDX_T]]
model.reg <- grepl('REGIONAL',SC)
treedist.closest.ind(ph, model.reg)
}, by=c('SC','BRL_T','IDX_T')]
tinfo <- merge(tinfo, tmp, by=c('SC','BRL_T','IDX_T','IDPOP'))
set(tinfo, NULL, 'IDPOP_CL', tinfo[, gsub('IDPOP_','',IDPOP_CL)])
#
# compute closest individual on simulated trees and determine proportion if either transmitter or among recipients
#
sucl <- subset(submitted.info, MODEL=='R')[, {
print(IDX)
#IDX<- 557; SUB_IDX_T<-2; SC<- '150701_REGIONAL_TRAIN2'
ph <- strs[[IDX]]
model.reg <- grepl('REGIONAL',SC)
ans <- treedist.closest.ind(ph, model.reg)
ans <- subset(ans, GD<=0.045)
tmp <- subset(tinfo, IDX_T==SUB_IDX_T, c(IDPOP, IDTR, IDREC))
ans <- merge(ans, tmp, by='IDPOP')
set(ans, NULL, 'IDPOP_CL', ans[, gsub('IDPOP_','',IDPOP_CL)])
ans[, IDCL:= as.character(IDTR)]
tmp <- ans[, which(!is.na(IDREC))]
set(ans, tmp, 'IDCL', ans[tmp, paste(IDCL, IDREC, sep=',')])
if(nrow(ans))
{
tmp <- ans[, list(CLD= IDPOP_CL%in%strsplit(IDCL,',')[[1]]) , by='IDPOP']
tmp <- tmp[, mean(CLD)]
}
if(!nrow(ans))
tmp <- NA_real_
list(TR_REC_perc= tmp)
}, by=c('IDX')]
submitted.info <- merge(submitted.info, sucl, by='IDX', all.x=1)
# compute same proportion on true trees
tmp <- subset(tinfo, BRL_T=='subst')[, {
ans <- data.table(IDPOP=IDPOP, IDCL= as.character(IDTR), IDREC=IDREC, IDPOP_CL=IDPOP_CL, GD=GD)
ans <- subset(ans, GD<=0.045)
tmp <- ans[, which(!is.na(IDREC))]
set(ans, tmp, 'IDCL', ans[tmp, paste(IDCL, IDREC, sep=',')])
tmp <- ans[, list(CLD= IDPOP_CL%in%strsplit(IDCL,',')[[1]]) , by='IDPOP']
list(TR_REC_perc_T= tmp[, mean(CLD)])
}, by='IDX_T']
setnames(tmp, 'IDX_T','SUB_IDX_T')
submitted.info <- merge(submitted.info, tmp, by='SUB_IDX_T', all.x=1)
#
# save
#
outfile <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation/submitted_160430.rda'
save(strs, strs_lsd_brl, strs_lsd_date, ttrs, tinfo, submitted.info, sclu.info, ttdists1, RFttdists, file=outfile)
}
##--------------------------------------------------------------------------------------------------------
## olli 03.12.15
##--------------------------------------------------------------------------------------------------------
treecomparison.submissions.update.151203<- function()
{
edir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation'
timetag <- '151203'
#
# collect results so far
#
file <- "~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation/submitted_151119_SRFQD.rda"
load(file)
load("~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation/submitted_151203_CCinfowithTTdists.rda")
# loads myinfo ttdists1 RFttdists
submitted.info <- copy(myinfo)
setnames(submitted.info, c('RF','NRF','NQD','kc0','kc1','kc_x','kc_y','rf_x','rf_y'), c('SB_RF','SB_NRF','SB_NQD','LSD_KC_L0','LSD_KC_L1','LSD_KC_L0_MDSx','LSD_KC_L0_MDSy','LSD_RF_MDSx','LSD_RF_MDSy'))
setnames(sclu.info, c('NRFC','NQDC'), c('SB_NRFC','SB_NQDC'))
#
# calculate RF on 'strs_lsd_date'
#
# take topology of true dated trees and compare to topology of LSD dated trees with RF
tmp <- treedist.robinsonfould.wrapper(submitted.info, ttrs, strs_lsd_date)
setnames(tmp, c('RF','NRF'), c('LSD_RF','LSD_NRF'))
submitted.info <- merge(submitted.info, tmp, by='IDX')
# take topology of true dated trees and compare to topology of LSD dated trees with RF
tmp <- treedist.robinsonfouldclusters.wrapper(submitted.info, ttrs, strs_lsd_date, tinfo)
setnames(tmp, c('RFC','NRFC'), c('LSD_RFC','LSD_NRFC'))
sclu.info <- merge(sclu.info, subset(tmp, select=c(IDX, IDCLU, LSD_NRFC)), by=c('IDX','IDCLU'))
strs.new <- strs
ttrs.new <- ttrs
tinfo.new <- copy(tinfo)
submitted.info.new <- copy(submitted.info)
sclu.info.new <- copy(sclu.info)
outdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation'
z <- load(paste(outdir, 'submitted_151119_S.rda', sep='/'))
stopifnot( nrow(subset(merge(subset(submitted.info, select=c('FILE','IDX')), subset(submitted.info.new, select=c('FILE','IDX')), by='FILE'), IDX.x!=IDX.y))==0 )
submitted.info <- merge(submitted.info.new, subset(submitted.info, select=c('IDX','NQD','lm_intercept','lm_slope','lm_rsq')), by='IDX')
strs <- strs.new
ttrs <- ttrs.new
sclu.info <- merge(sclu.info.new, subset(sclu.info, select=c('IDX','IDCLU','NQDC')), by=c('IDX','IDCLU'))
#
# save
#
outfile <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation/submitted_151203.rda'
save(strs, strs_lsd_brl, strs_lsd_date, ttrs, tinfo, submitted.info, sclu.info, ttdists1, RFttdists, file=outfile)
}
##--------------------------------------------------------------------------------------------------------
## olli 27.06.11
##--------------------------------------------------------------------------------------------------------
treecomparison.ana.160627.sclu<- function()
{
require(ggplot2)
require(data.table)
require(ape)
require(scales)
require(ggtree)
require(phangorn)
edir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation'
timetag <- '160713'
load(file.path(edir,'submitted_160713_07MSELSD.rda'))
set(sclu.info, sclu.info[, which(grepl('gag+pol+env',FILE,fixed=1))], 'GENE', 'GAG+POL+ENV')
sc <- copy(sclu.info)
tmp <- subset(submitted.info, TEAM=='RUNGAPS_ExaML', c(IDX, RUNGAPS, GENE))
sc <- merge(sc, tmp, by=c('IDX','GENE'), all.x=1)
sc <- merge(sc, data.table(GENE=c('P17','GAG','GAG+PARTIALPOL','POL','GAG+POL+ENV'), GENE_L=c(396, 1440, 3080, 2843, 6807)), by='GENE')
tmp <- unique( subset(tinfo, BRL_T=='subst' & grepl('REG',SC), select=c(SC, TAXA, ENV_GAPS_P, FULL_GAPS_P, GAG_GAPS_P, POL_GAPS_P)) )
tmp <- tmp[, list(FULL_GAPS_P=mean(FULL_GAPS_P), GAG_GAPS_P=mean(GAG_GAPS_P), POL_GAPS_P=mean(POL_GAPS_P), ENV_GAPS_P=mean(ENV_GAPS_P)), by='SC']
tmp <- melt(tmp, id.vars=c('SC'), variable.name='GENE', value.name='GAPS_P')
set(tmp, NULL, 'GENE', tmp[, gsub('FULL','GAG+POL+ENV',gsub('_GAPS_P','',GENE))])
sc <- merge(sc, tmp, by=c('SC','GENE'), all.x=1)
#
tmp <- subset(tinfo, !is.na(IDCLU))[, list(CLU_N=CLU_N[1], MXGPS_CLU= max(GPS), MDGPS_CLU=median(GPS)), by=c('SC','IDCLU')]
sc <- merge(sc, tmp, by=c('SC','IDCLU'))
set(sc, NULL, 'MODEL', sc[, factor(MODEL, levels=c('V','R'),labels=c('Model: Village','Model: Regional'))])
set(sc, sc[, which(SC=="VILL_99_APR15")],'SC',"150701_VILL_SCENARIO-C")
set(sc, NULL, 'SC', sc[, factor(SC, levels=c("150701_REGIONAL_TRAIN1", "150701_REGIONAL_TRAIN2", "150701_REGIONAL_TRAIN3", "150701_REGIONAL_TRAIN4","150701_REGIONAL_TRAIN5","150701_VILL_SCENARIO-A","150701_VILL_SCENARIO-B","150701_VILL_SCENARIO-C","150701_VILL_SCENARIO-D","150701_VILL_SCENARIO-E"),
labels=c('sc 1','sc 2','sc 3','sc 4','sc 5','sc A','sc B','sc C','sc D','sc E'))])
set(sc, NULL, 'GAPS', sc[, factor(GAPS, levels=c('none','low','high'),labels=c('none','as for\nBotswana\nsequences','as for\nUganda\nsequences'))])
set(sc, NULL, 'BEST', sc[, factor(BEST, levels=c('Y','N'),labels=c('best tree','replicate tree'))])
set(sc, sc[, which(GENE=='P17')], 'GENE', 'gag (p17)')
set(sc, sc[, which(GENE=='GAG')], 'GENE', 'gag')
set(sc, sc[, which(GENE=='GAG+PARTIALPOL')], 'GENE', 'gag + pol (prot,p51)')
set(sc, sc[, which(GENE=='POL')], 'GENE', 'pol')
set(sc, sc[, which(GENE=='GAG+POL+ENV')], 'GENE', 'gag+pol+env')
set(sc, sc[, which(TEAM=='IQTree')], 'TEAM', 'IQ-TREE')
set(sc, sc[, which(TEAM=='RAXML')], 'TEAM', 'RAxML')
set(sc, NULL, 'EXT', sc[, factor(EXT, levels=c('~0pc','5pc'),labels=c('~ 0%/year','5%/year'))])
set(sc, NULL, 'ART', sc[, factor(ART, levels=c('none','fast'),labels=c('none','fast'))])
sc <- subset(sc, OTHER=='N')
#
# add size adjusted KC
#
if(1)
{
require(gamlss)
kc.std.d <- subset(sc, TEAM!='MetaPIGA' & SC=='sc 1', select=c(SC, IDX, IDCLU, TEAM,GENE,CLU_N, KC))
kc.std.m3 <- gamlss(KC~I(CLU_N*(CLU_N-1)/2), data=kc.std.d)
tmp2 <- subset(sc, SC%in%c('sc 1','sc 2','sc 4'), select=c(SC,IDX, IDCLU, TEAM,GENE,CLU_N, KC))
tmp2[, KCadj:= KC / predict(kc.std.m3, data=kc.std.d, newdata=tmp2,type='response', se.fit=FALSE)]
ggplot(tmp2, aes(x=CLU_N)) + geom_point(aes(y=KCadj,colour=GENE, pch=TEAM)) + scale_y_log10() + facet_grid(~SC)
sc <- merge(sc, subset(tmp2, select=c(IDX, IDCLU, KCadj)), by=c('IDX','IDCLU'), all.x=1)
#
tmp <- melt(subset(sc, IDX==45), measure.var=c('NPD','NPDSQ','NRFC','NQDC','KCadj'))
ggplot( tmp, aes(x=CLU_N, y=value, colour=GENE, pch=TEAM)) + geom_point() + facet_grid(GENE+TEAM+IDX~variable)
#
# check dependence on size of cluster
#
ggplot( melt(sc, measure.var=c('NPD','NPDSQ','NRFC','NQDC','KCadj')), aes(x=CLU_N, y=value, colour=GENE, pch=TEAM)) + geom_point() + facet_grid(variable+TEAM+IDX~GENE, scales='free_y')
file <- file.path(edir, paste(timetag,'_','dependence_on_clustersize.pdf',sep=''))
ggsave(file=file, w=10, h=1000, limitsize = FALSE, useDingbats=FALSE)
}
#
#
#
sc <- sc[, list( NRFme=mean(NRFC, na.rm=TRUE),
NQDme=mean(NQDC, na.rm=TRUE),
NPDme=mean(NPD, na.rm=TRUE),
NPDSQme=mean(NPDSQ, na.rm=TRUE),
KCAme=mean(KCadj, na.rm=TRUE),
NRFmd=median(NRFC, na.rm=TRUE),
NQDmd=median(NQDC, na.rm=TRUE),
NPDmd=median(NPD, na.rm=TRUE),
NPDSQmd=median(NPDSQ, na.rm=TRUE),
KCAmd=median(KCadj, na.rm=TRUE)
), by=c('SC','GENE','GENE_L','TEAM','BEST','IDX','FILE','GAPS','GAPS_P','RUNGAPS','MODEL','TAXAN','TAXAN_T','ROOTED','SEQCOV','ART','ACUTE','EXT','OTHER')]
sc <- subset(sc, MODEL=='Model: Regional')
#
# KC distance standardized with TEAMS separated
#
tmp <- subset(sc, ACUTE=='low' & TEAM%in%c('IQ-TREE', 'PhyML', 'RAxML', 'MetaPIGA'))
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM)])
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag','pol','gag+pol+env'))])
ggplot(tmp, aes(x=GAPS)) +
geom_jitter(aes(y=KCAme, colour=GENE), position=position_jitter(w=0.8, h = 0), size=2) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_continuous(expand=c(0,0), limits=c(0,2.7)) + #, breaks=seq(0,1,0.1), minor_breaks=seq(0,1,0.05)) +
scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
labs( x='\nUnassembled sites in simulated sequences',
y='incorrectly estimated topologies of subtrees with 4 taxa\n(standardized Quartett distance)\n',
colour='part of genome used\nfor tree reconstruction',
pch='algorithm') +
theme_bw() + theme(legend.position='bottom') + facet_grid(~TEAM)
file <- file.path(edir, paste(timetag,'_','KC_clumean_polvsall_by_gaps_taxan1600_Acute10pc_by_TEAM.pdf',sep=''))
ggsave(file=file, w=12, h=5, useDingbats=FALSE)
#
# KC distance standardized with TEAMS separated
#
tmp <- subset(sc, ACUTE=='low' & GENE=='gag+pol+env' & TEAM%in%c('IQ-TREE', 'PhyML', 'RAxML', 'MetaPIGA'))
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM)])
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag','pol','gag+pol+env'))])
ggplot(tmp, aes(x=GAPS)) +
geom_jitter(aes(y=KCAme, pch=TEAM, colour=GENE), position=position_jitter(w=0.8, h = 0), size=2) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_continuous(expand=c(0,0), limits=c(0,2)) + #, breaks=seq(0,1,0.1), minor_breaks=seq(0,1,0.05)) +
scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
labs( x='\nunassembled PANGEA-HIV sequences',
y='standardized Quartett distance\n',
colour='part of simulated genome\nused or tree reconstruction',
pch='tree reconstruction\nmethod') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','KC_clumean_polvsall_by_gaps_taxan1600_Acute10pc_by_IQTree.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
#
# quartett distance standardized by n choose 4 with TEAMS separated
#
tmp <- subset(sc, ACUTE=='low' & TEAM%in%c('IQ-TREE', 'PhyML', 'RAxML', 'MetaPIGA'))
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM)])
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag','pol','gag+pol+env'))])
ggplot(tmp, aes(x=GAPS)) +
geom_jitter(aes(y=NQDme, colour=GENE), position=position_jitter(w=0.8, h = 0), size=2) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_continuous(labels = scales::percent, expand=c(0,0), limits=c(0, 0.45), breaks=seq(0,1,0.1), minor_breaks=seq(0,1,0.05)) +
scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
labs( x='\nUnassembled sites in simulated sequences',
y='incorrectly estimated topologies of subtrees with 4 taxa\n(standardized Quartett distance)\n',
colour='part of genome used\nfor tree reconstruction',
pch='algorithm') +
theme_bw() + theme(legend.position='bottom') + facet_grid(~TEAM)
file <- file.path(edir, paste(timetag,'_','QD_clumean_polvsall_by_gaps_taxan1600_Acute10pc_by_TEAM.pdf',sep=''))
ggsave(file=file, w=12, h=5, useDingbats=FALSE)
#
# quartett distance standardized by n choose 4 only IQ-Tree
#
tmp <- subset(sc, ACUTE=='low' & GENE=='gag+pol+env' & TEAM%in%c('IQ-TREE', 'PhyML', 'RAxML', 'MetaPIGA'))
tmp[, list(NQDme=mean(NQDme)), by=c('SC','GENE')]
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM)])
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag','pol','gag+pol+env'))])
ggplot(tmp, aes(x=GAPS)) +
geom_jitter(aes(y=NQDme, pch=TEAM, colour=GENE), position=position_jitter(w=0.8, h = 0), size=2) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_continuous(labels = scales::percent, expand=c(0,0), limits=c(0, 0.28), breaks=seq(0,1,0.1), minor_breaks=seq(0,1,0.05)) +
scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
labs( x='\nUnassembled sites in simulated sequences',
y='proportion among all subtrees with 4 taxa\n',
colour='part of simulated genome\nused for tree reconstruction',
pch='tree reconstruction\nmethod') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','QD_clumean_polvsall_by_gaps_taxan1600_Acute10pc_IQTree.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
#
# quartett distance standardized by n choose 4 MVR and BIONJ
#
tmp <- subset(sc, ACUTE=='low' & TEAM%in%c('RAxML','BioNJ','MVR'))
ggplot(tmp, aes(x=GAPS)) +
geom_jitter(aes(y=NQDme, colour=GENE, pch=TEAM), position=position_jitter(w=0.8, h = 0), size=2) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_continuous(labels = scales::percent, limits=c(0,1), expand=c(0,0)) +
#scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
labs( x='\nUnassembled sites in simulated sequences',
y='incorrectly estimated topologies of subtrees with 4 taxa\n(standardized Quartett distance)\n',
colour='part of genome used\nfor tree reconstruction',
pch='algorithm') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','QD_clumean_polvsall_by_gaps_taxan1600_Acute10pc_MVRBioNJ.pdf',sep=''))
ggsave(file=file, w=5, h=7, useDingbats=FALSE)
#
# quartett distance standardized by n choose 4
#
ggplot(tmp, aes(x=GAPS)) +
geom_jitter(aes(y=NQDme, colour=GENE, pch=TEAM), position=position_jitter(w=0.8, h = 0), size=2) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_continuous(labels = scales::percent, expand=c(0,0), limits=c(0, 0.45)) +
scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
labs( x='\nUnassembled sites in simulated sequences',
y='incorrectly estimated topologies of subtrees with 4 taxa\n(standardized Quartett distance)\n',
colour='part of genome used\nfor tree reconstruction',
pch='algorithm') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','QD_clumean_polvsall_by_gaps_taxan1600_Acute10pc.pdf',sep=''))
ggsave(file=file, w=5, h=7, useDingbats=FALSE)
#
# quartett distance by % gaps on x-axis
#
ggplot(subset(sc, TEAM=='IQ-TREE' & ACUTE=='low'), aes(x=GAPS_P)) +
#geom_point(aes(y=NQDme, colour=interaction(GENE,GAPS))) +
geom_boxplot(aes(y=NQDme, colour=GENE), width=0.1, outlier.shape=NA) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_x_continuous(labels = scales::percent) +
scale_y_continuous(labels = scales::percent, expand=c(0,0), limits=c(0, 0.35)) +
labs( x='\nUnassembled sites in simulated sequences',
y='incorrectly estimated topologies of subtrees with 4 taxa\n(standardized Quartett distance)\n',
colour='part of genome used\nfor tree reconstruction',
pch='algorithm') +
theme_bw() + theme(legend.position='bottom') + facet_grid(~GAPS, scales='free_x', space='free_x')
file <- file.path(edir, paste(timetag,'_','QD_clumean_polvsall_by_gapspc_taxan1600_Acute10pc.pdf',sep=''))
ggsave(file=file, w=6, h=5, useDingbats=FALSE)
#
# Quartett and KC metrics for IQ-TREE
#
tmp <- subset(sc, ACUTE=='low' & GENE!='pol' & TEAM%in%c('IQ-TREE','PhyML'))
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM)])
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag','pol','gag+pol+env'))])
set(tmp, NULL, 'NRFme', tmp[, 100*NRFme])
set(tmp, NULL, 'NQDme', tmp[, 100*NQDme])
tmp <- melt(tmp, measure.vars=c('NQDme','KCAme'))
set(tmp, tmp[, which(variable=='NQDme')], 'variable','incorrectly estimated topologies\nof subtrees of 4 taxa\n(proportion)')
set(tmp, tmp[, which(variable=='KCAme')], 'variable','error in reconstructed trees\n(std. Kendall-Colijn distance)\n')
ggplot(tmp, aes(x=GAPS)) +
geom_jitter(aes(y=value, colour=GENE), position=position_jitter(w=0.35, h = 0), size=1.5) +
scale_colour_manual(values=c('gag'="#FF7F00",'gag+pol+env'="grey50")) +
scale_y_continuous(limits=c(0,NA)) +
scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
labs( x='\nUnassembled sites of PANGEA-HIV sequences,\ncopied into simulated sequences with known phylogenetic relationship',
y='',
colour='part of simulated genome\nused for tree reconstruction',
pch='algorithm') +
theme_bw() + theme(legend.position='bottom') + facet_grid(variable~TEAM, scales='free_y')
file <- file.path(edir, paste(timetag,'_','TOPOOTHER_clumean_polvsall_by_gaps_taxan1600_Acute10pc.pdf',sep=''))
ggsave(file=file, w=6.5, h=6.5, useDingbats=FALSE)
#
# all tree metrics
#
tmp <- subset(sc, ACUTE=='low' & TEAM%in%c('IQ-TREE', 'PhyML', 'RAxML', 'MetaPIGA'))
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM)])
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag','pol','gag+pol+env'))])
set(tmp, NULL, 'NRFme', tmp[, 100*NRFme])
set(tmp, NULL, 'NQDme', tmp[, 100*NQDme])
tmp <- melt(tmp, measure.vars=c('NRFme','NQDme','NPDSQme','KCAme'))
set(tmp, tmp[, which(variable=='NRFme')], 'variable','std. Robinson Fould distance\n(%)')
set(tmp, tmp[, which(variable=='NQDme')], 'variable','std. Quartett distance\n(%)')
set(tmp, tmp[, which(variable=='NPDSQme')], 'variable','Path distance\n(upper bound is 2)')
set(tmp, tmp[, which(variable=='KCAme')], 'variable','Kendall-Colijn distance\n(standardized)')
ggplot(tmp, aes(x=GAPS)) +
geom_jitter(aes(y=value, colour=GENE), position=position_jitter(w=0.8, h = 0), size=2) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_continuous(limits=c(0,NA)) +
scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
labs( x='\nUnassembled sites in simulated sequences',
y='Distance between true and reconstructed tree topologies\n',
colour='part of genome used\nfor tree reconstruction',
pch='algorithm') +
theme_bw() + theme(legend.position='bottom') + facet_grid(variable~TEAM, scales='free_y')
file <- file.path(edir, paste(timetag,'_','TOPOOTHER_clumean_polvsall_by_gaps_taxan1600_Acute10pc.pdf',sep=''))
ggsave(file=file, w=15, h=15, useDingbats=FALSE)
#
# all tree metrics incl MVR BioNJ
#
tmp <- subset(sc, ACUTE=='low' & TEAM%in%c('IQ-TREE', 'PhyML', 'RAxML', 'MetaPIGA', 'MVR', 'BioNJ'))
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM)])
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag','pol','gag+pol+env'))])
set(tmp, NULL, 'NRFme', tmp[, 100*NRFme])
set(tmp, NULL, 'NQDme', tmp[, 100*NQDme])
tmp <- melt(tmp, measure.vars=c('NRFme','NQDme','NPDSQme'))
set(tmp, tmp[, which(variable=='NRFme')], 'variable','std. Robinson Fould distance\n(%)')
set(tmp, tmp[, which(variable=='NQDme')], 'variable','std. Quartett distance\n(%)')
set(tmp, tmp[, which(variable=='NPDSQme')], 'variable','Path distance\n(upper bound is 2)')
ggplot(tmp, aes(x=GAPS)) +
geom_jitter(aes(y=value, colour=GENE, pch=TEAM), position=position_jitter(w=0.8, h = 0), size=2) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_continuous(limits=c(0,NA)) +
scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17, 'BioNJ'=7, 'MVR'=9)) +
labs( x='\nUnassembled sites in simulated sequences',
y='Distance between true and reconstructed tree topologies\n',
colour='part of genome used\nfor tree reconstruction',
pch='algorithm') +
theme_bw() + theme(legend.position='bottom') + facet_grid(variable~TEAM, scales='free_y')
file <- file.path(edir, paste(timetag,'_','TOPOOTHER_clumean_polvsall_by_gaps_taxan1600_Acute10pc_withMVR.pdf',sep=''))
ggsave(file=file, w=20, h=15, useDingbats=FALSE)
#
# increasing gap coverage with ExaML - missing sites
#
tmp <- subset(sc, TEAM=='RUNGAPS_ExaML' & !grepl('p51', GENE))
tmp <- subset(tmp, GENE=='gag' & RUNGAPS==0.02 | GENE=='gag (p17)' & RUNGAPS==0.02 | GENE=='gag+pol+env')
tmp[, MISSING_P:= (RUNGAPS*GENE_L + (6807-GENE_L))/6807]
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag (p17)','gag','gag+pol+env'))])
ggplot(tmp, aes(x=MISSING_P)) +
geom_point(aes(y=NQDme, colour=GENE), size=2, pch=16) +
scale_colour_manual(values=c('gag (p17)'="#8C510A", 'gag'='red','gag+pol+env'="#3F4788FF")) +
scale_shape_manual(values=c('Botswana'=23, 'Uganda'=24)) +
scale_x_continuous(labels = scales::percent, expand=c(0,0), limits=c(0, 1), breaks=seq(0,1,0.1)) +
scale_y_continuous(labels = scales::percent, expand=c(0,0), limits=c(0, 0.3), breaks=seq(0,1,0.05)) +
#scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
labs( x='\nProportion of missing sites, relative to gag+pol+env genome',
y='proportion among all subtrees with 4 taxa\n',
colour='part of simulated genome\nused for tree reconstruction') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','QD_clumean_p17full_by_missingsites_taxan1600_Acute10pc.pdf',sep=''))
ggsave(file=file, w=5, h=7)
#
# increasing gap coverage with ExaML
#
tmp <- subset(sc, TEAM=='RUNGAPS_ExaML' & !grepl('p51', GENE))
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag (p17)','gag','gag+pol+env'))])
setkey(tmp, GENE, RUNGAPS)
tmp2 <- tmp[, list( RUNGAPS= RUNGAPS,
NQDmeSM= predict(loess(NQDme~RUNGAPS, span=5))),
by='GENE']
tmp <- merge(tmp, tmp2, by=c('GENE','RUNGAPS'))
tmp2 <- merge( rbind(data.table(GENE=c('gag (p17)','gag','gag+pol+env'), RUNGAPS=c(0.11, 0.08, 0.17), LOC='Botswana'), data.table(GENE=c('gag (p17)','gag','gag+pol+env'), RUNGAPS=c(0.21, 0.18, 0.47), LOC='Uganda')), tmp2,by=c('GENE','RUNGAPS'))
#set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag (p17)','gag','gag + pol (prot,p51)','gag+pol+env'))])
ggplot(tmp, aes(x=RUNGAPS)) +
geom_point(aes(y=NQDme, colour=GENE), size=2, pch=8) +
geom_line(aes(y=NQDmeSM, colour=GENE), size=0.5) +
geom_point(data=tmp2, aes(y=NQDmeSM, pch=LOC), size=2.5, fill='black') +
scale_colour_manual(values=c('gag (p17)'="#8C510A", 'gag'='red','gag+pol+env'="#3F4788FF")) +
scale_shape_manual(values=c('Botswana'=23, 'Uganda'=24)) +
scale_x_continuous(labels = scales::percent, expand=c(0,0), limits=c(0, 0.61), breaks=seq(0,1,0.1)) +
scale_y_continuous(labels = scales::percent, expand=c(0,0), limits=c(0, 0.4)) +
#scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
labs( x='\nUnassembled sites in simulated sequences',
y='incorrectly estimated topologies of subtrees with 4 taxa\n(standardized Quartett distance)\n',
colour='part of genome used\nfor tree reconstruction',
pch='sampling location'
) +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','QD_clumean_p17full_by_rungaps_taxan1600_Acute10pc.pdf',sep=''))
ggsave(file=file, w=5, h=7)
}
##--------------------------------------------------------------------------------------------------------
## olli 30.11.16
##--------------------------------------------------------------------------------------------------------
treecomparison.ana.161130.sclu<- function()
{
require(ggplot2)
require(data.table)
require(ape)
require(scales)
require(ggtree)
require(phangorn)
edir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation'
timetag <- '161123'
#
# merge
#
load(file.path(edir,'submitted_160713_09SBRL.rda'))
sc <- copy(sclu.info)
sa <- copy(submitted.info)
strs_rtt.160713 <- copy(strs_rtt)
load(file.path(edir,'submitted_170101_09SBRL.rda'))
sc <- rbind(sc, sclu.info, use.names=TRUE, fill=TRUE)
sa <- rbind(sa, submitted.info, use.names=TRUE, fill=TRUE)
trungps.tmp <- copy(trungps)
load(file.path(edir,'submitted_161123_07MSELSD_noTBRL.rda'))
sc <- rbind(sc, sclu.info, use.names=TRUE, fill=TRUE)
sa <- rbind(sa, submitted.info, use.names=TRUE, fill=TRUE)
set(sc, sc[, which(grepl('gag+pol+env',FILE,fixed=1))], 'GENE', 'GAG+POL+ENV')
submitted.info <- copy(sa)
sclu.info <- copy(sc)
trungps <- copy(trungps.tmp)
# get RUNGAPS column
sc[, RUNGAPS:=NA_real_]
tmp <- sc[, which(grepl('RUNGAPS',TEAM))]
set(sc, tmp, 'RUNGAPS', sc[tmp, as.numeric(gsub('.*TRAIN[0-9]([0-9][0-9]).*','\\1',regmatches(FILE,regexpr('TRAIN[0-9]+',FILE))))/100])
# get RUNGAPS_EXCL column
sc[, RUNGAPS_EXCL:=NA_real_]
set(sc, sc[, which(TEAM=='RUNGAPS_ExaML')], 'RUNGAPS_EXCL', 1)
tmp <- sc[, which(TEAM=='RUNGAPS_EXCLTAXA')]
set(sc, tmp, 'RUNGAPS_EXCL', sc[tmp, as.numeric(gsub('.*TRAIN[0-9][0-9][0-9]([0-9][0-9]).*','\\1',FILE))/100])
tmp <- sc[, which(TEAM=='RUNGAPS_EXCLSITE')]
set(sc, tmp, 'RUNGAPS_EXCL', sc[tmp, as.numeric(gsub('.*EXCLSITES([0-9][0-9]).*','\\1',FILE))/100])
# get PLEN column
sc[, PLEN:=NA_real_]
tmp <- sc[, which(TEAM=='PLEN')]
set(sc, tmp, 'PLEN', sc[tmp, as.numeric(gsub('PL','',regmatches(FILE, regexpr('PL[0-9]+',FILE))))])
# merge trungps
set(trungps, NULL, 'SC', trungps[, gsub('Regional','REGIONAL',gsub('_P17|_GAG|_FULL','',SC))])
set(trungps, NULL, 'GENE', trungps[, gsub('FULL','GAG+POL+ENV',GENE)])
tmp <- subset(trungps, !is.na(RUNGAPS) & !is.na(RUNGAPS_EXCL), c(SC, TEAM, GENE, RUNGAPS, RUNGAPS_EXCL, ACTG_P, UNASS_P, SITES_N))
tmp <- unique(tmp, by=c('SC','TEAM','GENE','RUNGAPS','RUNGAPS_EXCL','SITES_N'))
sc <- merge(sc, tmp, by=c('SC','TEAM','GENE','RUNGAPS','RUNGAPS_EXCL'), all.x=1)
sclu.info <- copy(sc)
#
# end: pre-processing
#
# count trees:
subset(sa, !grepl('GTR|TRAIN3|TRAIN5',SC) & OTHER=='N' & MODEL=='R' & GENE%in%c('GAG','GAG+POL+ENV') & !grepl('MVR|BioNJ|EXCLTAXA|EXCLSITE',TEAM))
#
#
#
sc <- copy(sclu.info)
sc <- merge(sc, data.table(GENE=c('P17','GAG','GAG+PARTIALPOL','POL','GAG+POL+ENV'), GENE_L=c(396, 1440, 3080, 2843, 6807)), by='GENE')
#
#tmp <- subset(tinfo, !is.na(IDCLU))[, list(CLU_N=CLU_N[1], MXGPS_CLU= max(GPS), MDGPS_CLU=median(GPS)), by=c('SC','IDCLU')]
#sc <- merge(sc, tmp, by=c('SC','IDCLU'))
set(sc, NULL, 'MODEL', sc[, factor(MODEL, levels=c('V','R'),labels=c('Model: Village','Model: Regional'))])
set(sc, sc[, which(SC=="VILL_99_APR15")],'SC',"150701_VILL_SCENARIO-C")
set(sc, NULL, 'SC', sc[, factor(SC, levels=c("150701_REGIONAL_TRAIN1", "150701_REGIONAL_TRAIN2", "150701_REGIONAL_TRAIN3", "150701_REGIONAL_TRAIN4","150701_REGIONAL_TRAIN5","161121_REGIONAL_TRAIN6","161121_REGIONAL_TRAIN7","161121_REGIONAL_TRAIN8","150701_VILL_SCENARIO-A","150701_VILL_SCENARIO-B","150701_VILL_SCENARIO-C","150701_VILL_SCENARIO-D","150701_VILL_SCENARIO-E","161121_REGIONAL_GTRFIXED1","161121_REGIONAL_GTRFIXED2","161121_REGIONAL_GTRFIXED3"),
labels=c('sc 1','sc 2','sc 3','sc 4','sc 5','sc 6','sc 7','sc 8','sc A','sc B','sc C','sc D','sc E',"161121_REGIONAL_GTRFIXED1","161121_REGIONAL_GTRFIXED2","161121_REGIONAL_GTRFIXED3"))])
set(sc, NULL, 'GAPS', sc[, factor(GAPS, levels=c('none','low','high'),labels=c('none','as for\nBotswana\nsequences','as for\nUganda\nsequences'))])
set(sc, NULL, 'BEST', sc[, factor(BEST, levels=c('Y','N'),labels=c('best tree','replicate tree'))])
set(sc, sc[, which(GENE=='P17')], 'GENE', 'gag (p17)')
set(sc, sc[, which(GENE=='GAG')], 'GENE', 'gag')
set(sc, sc[, which(GENE=='GAG+PARTIALPOL')], 'GENE', 'gag + pol (prot,p51)')
set(sc, sc[, which(GENE=='POL')], 'GENE', 'pol')
set(sc, sc[, which(GENE=='GAG+POL+ENV')], 'GENE', 'gag+pol+env')
set(sc, sc[, which(TEAM=='IQTree')], 'TEAM', 'IQ-TREE')
set(sc, sc[, which(TEAM=='RAXML')], 'TEAM', 'RAxML')
set(sc, NULL, 'EXT', sc[, factor(EXT, levels=c('~0pc','5pc'),labels=c('~ 0%/year','5%/year'))])
set(sc, NULL, 'ART', sc[, factor(ART, levels=c('none','fast'),labels=c('none','fast'))])
sc <- subset(sc, OTHER=='N')
#
sc <- sc[, list( NRFme=mean(NRFC, na.rm=TRUE),
NQDme=mean(NQDC, na.rm=TRUE),
NPDme=mean(NPD, na.rm=TRUE),
NPDSQme=mean(NPDSQ, na.rm=TRUE),
NRFmd=median(NRFC, na.rm=TRUE),
NQDmd=median(NQDC, na.rm=TRUE),
NPDmd=median(NPD, na.rm=TRUE),
NPDSQmd=median(NPDSQ, na.rm=TRUE)
), by=c('SC','GENE','GENE_L','TEAM','BEST','IDX','FILE','GAPS','UNASS_P','RUNGAPS','RUNGAPS_EXCL','PLEN','MODEL','TAXAN','TAXAN_T','ROOTED','SEQCOV','ART','ACUTE','EXT','OTHER','SUB_IDX_T','SITES_N')]
sc <- subset(sc, MODEL=='Model: Regional')
#
# patchy vs partial sequences on TRAIN1
#
tmp <- subset(sc, SC=='sc 1' &
((TEAM=='RUNGAPS_ExaML' & grepl('gag+pol+env',GENE,fixed=1)) |
(TEAM=='PLEN')))
tmp[, MISSING_P:= NA_real_]
tmp2 <- tmp[, which(TEAM=='RUNGAPS_ExaML')]
set(tmp, tmp2,'MISSING_P', tmp[tmp2, (RUNGAPS*GENE_L + (6807-GENE_L))/6807])
tmp2 <- tmp[, which(TEAM=='PLEN')]
set(tmp, tmp2,'MISSING_P', tmp[tmp2, 1-PLEN/6807])
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM, levels=c('RUNGAPS_ExaML','PLEN'), labels=c('patchy gag+pol+env sequences','partial sequences'))])
ggplot(tmp, aes(x=MISSING_P)) +
geom_vline(aes(xintercept=1-1503/6807)) +
geom_text(x=1-1503/6807, y=0.05, label='HIV-1 gag gene', hjust=-.1, size=3) +
geom_point(aes(y=NQDme, colour=TEAM, pch=SC), size=2) +
scale_shape_manual(values=c('sc 1'=17)) +
scale_colour_manual(values=c('partial sequences'="#35978F",'patchy gag+pol+env sequences'="#3F4788FF")) +
scale_x_continuous(labels = scales::percent, expand=c(0,0), limits=c(0,0.609), breaks=seq(0,1,0.1)) +
scale_y_continuous(labels = scales::percent, expand=c(0,0), limits=c(0,0.2)) +
#scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
labs( x='\nproportion of missing sites, relative to gag+pol+env genome',
y='proportion among all subtrees with 4 taxa\n',
colour='distribution of unassembled sites',
pch='sampling coverage') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','QD_clumean_by_missingsites.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
#
# patchy vs partial sequences on TRAIN6
#
tmp <- subset(sc, SC=='sc 6' &
((TEAM=='RUNGAPS_ExaML' & grepl('gag+pol+env',GENE,fixed=1)) |
(TEAM=='PLEN')))
tmp[, MISSING_P:= NA_real_]
tmp2 <- tmp[, which(TEAM=='RUNGAPS_ExaML')]
set(tmp, tmp2,'MISSING_P', tmp[tmp2, (RUNGAPS*GENE_L + (6807-GENE_L))/6807])
tmp2 <- tmp[, which(TEAM=='PLEN')]
set(tmp, tmp2,'MISSING_P', tmp[tmp2, 1-PLEN/6807])
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM, levels=c('RUNGAPS_ExaML','PLEN'), labels=c('patchy gag+pol+env sequences','partial sequences'))])
ggplot(tmp, aes(x=MISSING_P)) +
geom_vline(aes(xintercept=1-1503/6807)) +
geom_text(x=1-1503/6807, y=0.05, label='HIV-1 gag gene', hjust=-.1, size=3) +
geom_point(aes(y=NQDme, colour=TEAM, pch=SC), size=2) +
scale_shape_manual(values=c('sc 6'=16)) +
scale_colour_manual(values=c('partial sequences'="#35978F",'patchy gag+pol+env sequences'="#3F4788FF")) +
scale_x_continuous(labels = scales::percent, expand=c(0,0), limits=c(0, 0.609), breaks=seq(0,1,0.1)) +
scale_y_continuous(labels = scales::percent, expand=c(0,0), limits=c(0,0.2)) +
#scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
labs( x='\nproportion of missing sites, relative to gag+pol+env genome',
y='proportion among all subtrees with 4 taxa\n',
colour='distribution of unassembled sites',
pch='sampling coverage') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','QD_clumean_by_missingsites_highcoverage.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
#
# patchy vs partial sequences on TRAIN7
#
tmp <- subset(sc, SC=='sc 7' &
((TEAM=='RUNGAPS_ExaML' & grepl('gag+pol+env',GENE,fixed=1)) |
(TEAM=='PLEN')))
tmp[, MISSING_P:= NA_real_]
tmp2 <- tmp[, which(TEAM=='RUNGAPS_ExaML')]
set(tmp, tmp2,'MISSING_P', tmp[tmp2, (RUNGAPS*GENE_L + (6807-GENE_L))/6807])
tmp2 <- tmp[, which(TEAM=='PLEN')]
set(tmp, tmp2,'MISSING_P', tmp[tmp2, 1-PLEN/6807])
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM, levels=c('RUNGAPS_ExaML','PLEN'), labels=c('patchy gag+pol+env sequences','partial sequences'))])
ggplot(tmp, aes(x=MISSING_P)) +
geom_vline(aes(xintercept=1-1503/6807)) +
geom_text(x=1-1503/6807, y=0.05, label='HIV-1 gag gene', hjust=-.1, size=3) +
geom_point(aes(y=NQDme, colour=TEAM, pch=SC), size=2) +
scale_shape_manual(values=c('sc 7'=18)) +
scale_colour_manual(values=c('partial sequences'="#35978F",'patchy gag+pol+env sequences'="#3F4788FF")) +
scale_x_continuous(labels = scales::percent, expand=c(0,0), limits=c(0,0.609), breaks=seq(0,1,0.1)) +
scale_y_continuous(labels = scales::percent, expand=c(0,0), limits=c(0,0.2)) +
#scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
labs( x='\nproportion of missing sites, relative to gag+pol+env genome',
y='proportion among all subtrees with 4 taxa\n',
colour='distribution of unassembled sites',
pch='sampling coverage') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','QD_clumean_by_missingsites_seqcov31.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
#
# patchy vs partial sequences on TRAIN8
#
tmp <- subset(sc, SC=='sc 8' &
((TEAM=='RUNGAPS_ExaML' & grepl('gag+pol+env',GENE,fixed=1)) |
(TEAM=='PLEN')))
tmp[, MISSING_P:= NA_real_]
tmp2 <- tmp[, which(TEAM=='RUNGAPS_ExaML')]
set(tmp, tmp2,'MISSING_P', tmp[tmp2, (RUNGAPS*GENE_L + (6807-GENE_L))/6807])
tmp2 <- tmp[, which(TEAM=='PLEN')]
set(tmp, tmp2,'MISSING_P', tmp[tmp2, 1-PLEN/6807])
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM, levels=c('RUNGAPS_ExaML','PLEN'), labels=c('patchy gag+pol+env sequences','partial sequences'))])
ggplot(tmp, aes(x=MISSING_P)) +
geom_point(aes(y=NQDme, colour=TEAM, pch=SC), size=2) +
scale_colour_manual(values=c('partial sequences'="#35978F",'patchy gag+pol+env sequences'="#3F4788FF")) +
scale_x_continuous(labels = scales::percent, expand=c(0,0), limits=c(0,0.609), breaks=seq(0,1,0.1)) +
scale_y_continuous(labels = scales::percent, expand=c(0,0), limits=c(0,0.2)) +
scale_shape_manual(values=c('sc 8'=8)) +
labs( x='\nproportion of missing sites, relative to gag+pol+env genome',
y='proportion among all subtrees with 4 taxa\n',
colour='distribution of unassembled sites',
pch='sampling coverage') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','QD_clumean_by_missingsites_seqcov15.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
#
# patchy sequences by seq coverage
#
tmp <- subset(sc, TEAM=='RUNGAPS_ExaML' & !grepl('TRAIN2',FILE) & !grepl('sc 6',SC) & grepl('gag+pol+env',GENE,fixed=1))
tmp[, MISSING_P:= NA_real_]
tmp2 <- tmp[, which(TEAM=='RUNGAPS_ExaML')]
set(tmp, tmp2,'MISSING_P', tmp[tmp2, (RUNGAPS*GENE_L + (6807-GENE_L))/6807])
set(tmp, NULL, 'SC', tmp[, factor(as.character(SC), levels=c('sc 1','sc 8','sc 7','sc 6'), labels=c('6%','15%','31%','60%'))])
ggplot(tmp, aes(x=MISSING_P)) +
geom_point(aes(y=NQDme, pch=SC), size=2, colour="#3F4788FF") +
#geom_smooth(aes(y=NQDme, colour=SC), se=FALSE, method='lm', size=0.7) +
scale_shape_manual(values=c('6%'=17,'15%'=1,'31%'=8,'60%'=16)) +
#scale_colour_manual(values=c('6%'="#3F4788FF",'15%'="#0570B0",'31%'="#3690C0",'60%'="#74A9CF")) +
scale_x_continuous(labels = scales::percent, expand=c(0,0), limits=c(0,0.609), breaks=seq(0,1,0.1)) +
scale_y_continuous(labels = scales::percent, expand=c(0,0), limits=c(0,0.2)) +
#scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
labs( x='\nproportion of missing sites, relative to gag+pol+env genome',
y='proportion among all subtrees with 4 taxa\n',
colour='sampling coverage',
pch='sampling coverage') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','QD_clumean_by_missingsites_variedseqcov.pdf',sep=''))
file <- file.path(edir, paste(timetag,'_','QD_clumean_by_missingsites_variedseqcov2.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
#
# repeat QD dist with control run (no model misspec)
# currently no improvement whatsoever, potentially due to using -D
#
tmp <- subset(sc, ACUTE=='low' & GENE=='gag+pol+env' & TEAM%in%c('GTRFIXED','IQ-TREE', 'PhyML', 'RAxML', 'MetaPIGA'))
#tmp[, list(NQDme=mean(NQDme)), by=c('SC','GENE')]
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM)])
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag','pol','gag+pol+env'))])
ggplot(tmp, aes(x=GAPS)) +
geom_jitter(aes(y=NQDme, pch=TEAM, colour=GENE), position=position_jitter(w=0.4, h = 0), size=2) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_continuous(labels = scales::percent, expand=c(0,0), limits=c(0, 0.28), breaks=seq(0,1,0.1), minor_breaks=seq(0,1,0.05)) +
scale_shape_manual(values=c('GTRFIXED'=10, 'IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
labs( x='\nUnassembled sites in simulated sequences',
y='proportion among all subtrees with 4 taxa\n',
colour='part of simulated genome\nused for tree reconstruction',
pch='tree reconstruction\nmethod') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','QD_clumean_by_missingsites_withGTRfixed.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
#
# consider excluding columns with most gaps, x-axis before taxa excluded
#
tmp <- subset(sc, SC=='sc 2' & ACUTE=='low' & GENE=='gag+pol+env' & TEAM=='RUNGAPS_EXCLSITE',c(TEAM,RUNGAPS_EXCL,RUNGAPS,NQDme, SITES_N,UNASS_P))
tmp2 <- subset(sc, SC=='sc 2' & ACUTE=='low' & GENE=='gag+pol+env' & TEAM=='RUNGAPS_ExaML',c(TEAM,RUNGAPS_EXCL,RUNGAPS,NQDme,UNASS_P))
setnames(tmp2, 'RUNGAPS_EXCL','DUMMY')
tmp2 <- merge(tmp2, as.data.table(expand.grid(RUNGAPS_EXCL=c(0.2,0.3,0.4,0.5), DUMMY=1)), by='DUMMY', allow.cartesian=TRUE)
tmp2[, DUMMY:=NULL]
tmp2[, SITES_N:= 6783L]
tmp <- rbind(tmp, tmp2)
set(tmp, NULL, 'MISSING_P', tmp[, (RUNGAPS*SITES_N + (6783-SITES_N))/6783])
set(tmp, NULL, 'RUNGAPS_EXCL', tmp[, factor(RUNGAPS_EXCL, levels=c(.2,.3,.4,.5), labels=c("alignment positions with\n>20% unassembled characters\nexcluded","alignment positions with\n>30% unassembled characters\nexcluded","alignment positions with\n>40% unassembled characters\nexcluded","alignment positions with\n>50% unassembled characters\nexcluded"))])
# plot by x-axis RUNGAPS before sites excluded
ggplot(tmp, aes(x=RUNGAPS)) +
geom_point(aes(y=NQDme, colour=TEAM)) +
scale_colour_manual(values=c('RUNGAPS_ExaML'="grey50",'RUNGAPS_EXCLSITE'="#FF7F00")) +
#scale_colour_brewer(palette='Paired') +
scale_x_continuous(labels=scales::percent, expand=c(0,0), breaks=seq(0,1,0.1), limits=c(0,0.63)) +
scale_y_continuous(labels=scales::percent, expand=c(0,0), limits=c(0, 0.35), breaks=seq(0,1,0.05), minor_breaks=seq(0,1,0.01)) +
facet_grid(~RUNGAPS_EXCL) +
theme_bw() + theme(legend.position='bottom') +
labs( x='\nUnassembled sites in simulated sequences',
y='incorrectly estimated subtrees with 4 taxa\n(proportion)\n',
colour='alignment columns excluded\nbefore tree reconstruction',
pch='alignment columns excluded\nbefore tree reconstruction')
file <- file.path(edir, paste(timetag,'_','QD_clumean_by_excludedsites.pdf',sep=''))
ggsave(file=file, w=12, h=5, useDingbats=FALSE)
ggplot(tmp, aes(x=MISSING_P)) +
geom_point(aes(y=NQDme, colour=TEAM)) +
scale_colour_manual(values=c('RUNGAPS_ExaML'="grey50",'RUNGAPS_EXCLSITE'="#FF7F00")) +
#scale_colour_brewer(palette='Paired') +
scale_x_continuous(labels=scales::percent, expand=c(0,0), breaks=seq(0,1,0.1), limits=c(0,1)) +
scale_y_continuous(labels=scales::percent, expand=c(0,0), limits=c(0, 0.35), breaks=seq(0,1,0.05), minor_breaks=seq(0,1,0.01)) +
facet_grid(~RUNGAPS_EXCL) +
theme_bw() + theme(legend.position='bottom') +
labs( x='\nMissing characters in simulated sequences relative to gag+pol+env',
y='incorrectly estimated subtrees with 4 taxa\n(proportion)\n',
colour='alignment columns excluded\nbefore tree reconstruction',
pch='alignment columns excluded\nbefore tree reconstruction')
file <- file.path(edir, paste(timetag,'_','QD_clumean_by_excludedsites_missingp.pdf',sep=''))
ggsave(file=file, w=12, h=5, useDingbats=FALSE)
#tmp2 <- tmp[, list( RUNGAPS= RUNGAPS,
# YM= predict(loess(NQDme~RUNGAPS, span=5, degree=2))),
# by='RUNGAPS_EXCL']
tmp <- merge(tmp, tmp2, by=c('RUNGAPS_EXCL','RUNGAPS'))
ggplot(subset(tmp, RUNGAPS_EXCL%in%c(">20% unassembled characters",">30% unassembled characters",">50% unassembled characters","none")), aes(x=RUNGAPS)) +
geom_point(data=subset(tmp, RUNGAPS_EXCL%in%c(">20% unassembled characters",">30% unassembled characters",">50% unassembled characters","none")), aes(y=NQDme, colour=RUNGAPS_EXCL, pch=RUNGAPS_EXCL), size=2, show.legend=FALSE) +
geom_line(aes(y=YM, colour=RUNGAPS_EXCL)) +
scale_colour_brewer(palette='Set1') +
scale_x_continuous(labels=scales::percent, expand=c(0,0), breaks=seq(0,1,0.1), limits=c(0,0.63)) +
scale_y_continuous(labels=scales::percent, expand=c(0,0), limits=c(0, 0.35), breaks=seq(0,1,0.05), minor_breaks=seq(0,1,0.01)) +
labs( x='\nUnassembled sites in simulated sequences',
y='proportion among all subtrees with 4 taxa\n',
colour='alignment columns excluded\nbefore tree reconstruction',
pch='alignment columns excluded\nbefore tree reconstruction') +
theme_bw() + theme(legend.position='bottom') +
facet_grid(~RUNGAPS_EXCL)
file <- file.path(edir, paste(timetag,'_','QD_clumean_by_excludedsites.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
#
# consider excluding taxa with most gaps
# and re-calculate Quartet distance only on common phylogeny
#
tmp <- subset(sc, SC=='sc 2' & ACUTE=='low' & GENE=='gag+pol+env' & TEAM=='RUNGAPS_EXCLTAXA', c(TEAM, RUNGAPS, RUNGAPS_EXCL,IDX, SUB_IDX_T, UNASS_P))
tmp <- dcast.data.table(tmp, RUNGAPS+RUNGAPS_EXCL+SUB_IDX_T+UNASS_P~TEAM, value.var='IDX')
tmp2 <- subset(sc, SC=='sc 2' & ACUTE=='low' & GENE=='gag+pol+env' & TEAM=='RUNGAPS_ExaML', c(RUNGAPS, IDX))
tmp <- merge(tmp, tmp2, by='RUNGAPS')
tmp[, IDX_NEW:= seq_len(nrow(tmp))]
# create new vector of trees without taxa that are not in the excluded analysis
ph_tmp <- vector('list', nrow(tmp)) # the trees for RUNGAPS_ExaML are stored in the 160713 version
for(i in seq_len(nrow(tmp)))
{
#i <- 1
ph.full <- strs_rtt.160713[[tmp[i,IDX]]]
ph.excl <- strs_rtt[[tmp[i,RUNGAPS_EXCLTAXA]]]
ph <- drop.tip(ph.full, setdiff(ph.full$tip.label, ph.excl$tip.label))
stopifnot( Ntip(ph)==Ntip(ph.excl) )
ph_tmp[[i]] <- ph
}
# run quartet distances on cluster
setnames(tmp, c('IDX','IDX_NEW'), c("IDX_OLD",'IDX'))
tmp3 <- copy(tmp)
save(ph_tmp, tmp, ttrs, tinfo, file=file.path(edir, paste0(timetag,'_extraQD.rda')))
load(file.path(edir,'161123_extraQD_01extra.rda'))
tmp <- merge(tmp3, tmp2, by='IDX')
tmp3 <- tmp[, list(NQDme=mean(NQDC, na.rm=TRUE)), by=c('IDX','IDX_OLD','RUNGAPS','RUNGAPS_EXCL','RUNGAPS_EXCLTAXA','UNASS_P')]
tmp3[, TEAM:='RUNGAPS_ExaML']
tmp <- subset(sc, SC=='sc 2' & ACUTE=='low' & GENE=='gag+pol+env' & TEAM%in%c('RUNGAPS_EXCLTAXA'), c(TEAM,RUNGAPS_EXCL,RUNGAPS,NQDme, UNASS_P))
tmp <- rbind(tmp,subset(tmp3, select=c(TEAM, RUNGAPS_EXCL, RUNGAPS, NQDme, UNASS_P)), fill=TRUE, use.names=TRUE)
set(tmp, NULL, 'RUNGAPS_EXCL', tmp[, factor(RUNGAPS_EXCL, levels=c(0.5,.6,.7,.8,.9), labels=c(">50% unassembled sites",">60% unassembled sites",">70% unassembled sites",">80% unassembled sites",">90% unassembled sites"))])
set(tmp, NULL, 'LEGEND', tmp[, paste('sequences with\n',as.character(RUNGAPS_EXCL),'\nexcluded')])
# plot x-axis AFTER taxa excluded
ggplot(subset(tmp,RUNGAPS_EXCL!=">50% unassembled sites"), aes(x=UNASS_P)) +
geom_point(aes(y=NQDme, colour=TEAM)) +
scale_colour_manual(values=c('RUNGAPS_ExaML'="grey50",'RUNGAPS_EXCLTAXA'="#FF7F00")) +
#scale_colour_brewer(palette='Paired') +
scale_x_continuous(labels=scales::percent, expand=c(0,0), breaks=seq(0,1,0.1), limits=c(0,0.63)) +
scale_y_continuous(labels=scales::percent, expand=c(0,0), limits=c(0, 0.25), breaks=seq(0,1,0.05), minor_breaks=seq(0,1,0.01)) +
facet_grid(~LEGEND) +
theme_bw() + theme(legend.position='bottom') +
labs( x='\nUnassembled sites in simulated sequences after sequences excluded',
y='incorrectly estimated common subtrees with 4 taxa\n(proportion)\n',
colour='sequences excluded\nbefore tree reconstruction',
pch='sequences excluded\nbefore tree reconstruction')
file <- file.path(edir, paste(timetag,'_','QD_clumean_by_excludedtaxa_samenumbertaxa_pcafterexcluded.pdf',sep=''))
ggsave(file=file, w=12, h=5, useDingbats=FALSE)
# plot x-axis BEFORE taxa excluded
ggplot(subset(tmp,RUNGAPS_EXCL!=">50% unassembled sites"), aes(x=RUNGAPS)) +
geom_point(aes(y=NQDme, colour=TEAM)) +
scale_colour_manual(values=c('RUNGAPS_ExaML'="grey50",'RUNGAPS_EXCLTAXA'="#FF7F00")) +
#scale_colour_brewer(palette='Paired') +
scale_x_continuous(labels=scales::percent, expand=c(0,0), breaks=seq(0,1,0.1), limits=c(0,0.63)) +
scale_y_continuous(labels=scales::percent, expand=c(0,0), limits=c(0, 0.25), breaks=seq(0,1,0.05), minor_breaks=seq(0,1,0.01)) +
facet_grid(~LEGEND) +
theme_bw() + theme(legend.position='bottom') +
labs( x='\nUnassembled sites in simulated sequences',
y='incorrectly estimated common subtrees with 4 taxa\n(proportion)\n',
colour='sequences excluded\nbefore tree reconstruction',
pch='sequences excluded\nbefore tree reconstruction')
file <- file.path(edir, paste(timetag,'_','QD_clumean_by_excludedtaxa_samenumbertaxa.pdf',sep=''))
ggsave(file=file, w=12, h=5, useDingbats=FALSE)
#
# consider excluding taxa with most gaps, x-axis before taxa excluded
#
tmp <- subset(sc, SC=='sc 2' & ACUTE=='low' & GENE=='gag+pol+env' & TEAM%in%c('RUNGAPS_EXCLTAXA','RUNGAPS_ExaML'))
set(tmp, NULL, 'RUNGAPS_EXCL', tmp[, factor(RUNGAPS_EXCL, levels=c(0.5,.6,.7,.8,.9,1), labels=c(">50% unassembled sites",">60% unassembled sites",">70% unassembled sites",">80% unassembled sites",">90% unassembled sites","none"))])
tmp2 <- tmp[, list( RUNGAPS= RUNGAPS,
YM= predict(loess(NQDme~RUNGAPS, span=5, degree=2))),
by='RUNGAPS_EXCL']
tmp <- merge(tmp, tmp2, by=c('RUNGAPS_EXCL','RUNGAPS'))
ggplot(subset(tmp, RUNGAPS_EXCL%in%c(">60% unassembled sites",">80% unassembled sites",">90% unassembled sites","none")), aes(x=RUNGAPS)) +
geom_point(data=subset(tmp, RUNGAPS_EXCL%in%c(">80% unassembled sites","none")), aes(y=NQDme, colour=RUNGAPS_EXCL, pch=RUNGAPS_EXCL), size=2, show.legend=FALSE) +
geom_line(aes(y=YM, colour=RUNGAPS_EXCL)) +
scale_colour_brewer(palette='Set1') +
scale_x_continuous(labels=scales::percent, expand=c(0,0), breaks=seq(0,1,0.1), limits=c(0,0.63)) +
scale_y_continuous(labels=scales::percent, expand=c(0,0), limits=c(0, 0.25), breaks=seq(0,1,0.05), minor_breaks=seq(0,1,0.01)) +
labs( x='\nUnassembled sites in simulated sequences',
y='proportion among all subtrees with 4 taxa\n',
colour='sequences excluded\nbefore tree reconstruction',
pch='sequences excluded\nbefore tree reconstruction') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','QD_clumean_by_excludedtaxa.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
#
# consider excluding taxa with most gaps, x-axis after taxa excluded
#
tmp <- subset(sc, ACUTE=='low' & GENE=='gag+pol+env' & TEAM%in%c('RUNGAPS2','RUNGAPS_ExaML'))
set(tmp, NULL, 'RUNGAPS_EXCL', tmp[, factor(RUNGAPS_EXCL, levels=c(0.5,.6,.7,.8,.9,1), labels=c(">50% unassembled sites",">60% unassembled sites",">70% unassembled sites",">80% unassembled sites",">90% unassembled sites","none"))])
tmp2 <- tmp[, list( UNASS_P= UNASS_P,
YM= predict(loess(NQDme~UNASS_P, span=5, degree=2))),
by='RUNGAPS_EXCL']
tmp <- merge(tmp, tmp2, by=c('RUNGAPS_EXCL','UNASS_P'))
ggplot(subset(tmp, RUNGAPS_EXCL%in%c(">60% unassembled sites",">80% unassembled sites",">90% unassembled sites","none")), aes(x=UNASS_P)) +
geom_point(data=subset(tmp, RUNGAPS_EXCL%in%c(">80% unassembled sites","none")), aes(y=NQDme, colour=RUNGAPS_EXCL, pch=RUNGAPS_EXCL), size=2, show.legend=FALSE) +
geom_line(aes(y=YM, colour=RUNGAPS_EXCL)) +
scale_colour_brewer(palette='Set1') +
scale_x_continuous(labels=scales::percent, expand=c(0,0), breaks=seq(0,1,0.1), limits=c(0,0.63)) +
scale_y_continuous(labels=scales::percent, expand=c(0,0), limits=c(0, 0.25), breaks=seq(0,1,0.05), minor_breaks=seq(0,1,0.01)) +
labs( x='\nUnassembled sites in simulated sequences',
y='proportion among all subtrees with 4 taxa\n',
colour='sequences excluded\nbefore tree reconstruction',
pch='sequences excluded\nbefore tree reconstruction') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','QD_clumean_by_unassembledafterexclusion.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
}
treecomparison.ana.170424.sclu<- function()
{
require(ggplot2)
require(data.table)
require(ape)
require(scales)
require(ggtree)
require(phangorn)
edir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation'
timetag <- '170424'
#
# merge
#
load(file.path(edir,'submitted_160713_09SBRL.rda'))
sc <- copy(sclu.info)
sa <- copy(submitted.info)
strs_rtt.160713 <- copy(strs_rtt)
load(file.path(edir,'submitted_170101_09SBRL.rda'))
sc <- rbind(sc, sclu.info, use.names=TRUE, fill=TRUE)
sa <- rbind(sa, submitted.info, use.names=TRUE, fill=TRUE)
trungps.tmp <- copy(trungps)
load(file.path(edir,'submitted_170424_09SBRL.rda'))
sc <- rbind(sc, sclu.info, use.names=TRUE, fill=TRUE)
sa <- rbind(sa, submitted.info, use.names=TRUE, fill=TRUE)
#trungps <- copy(trungps.tmp)
load(file.path(edir,'submitted_161123_07MSELSD_noTBRL.rda'))
sc <- rbind(sc, sclu.info, use.names=TRUE, fill=TRUE)
sa <- rbind(sa, submitted.info, use.names=TRUE, fill=TRUE)
set(sc, sc[, which(grepl('gag+pol+env',FILE,fixed=1))], 'GENE', 'GAG+POL+ENV')
submitted.info <- copy(sa)
sclu.info <- copy(sc)
trungps <- copy(trungps.tmp)
# get RUNGAPS column
sc[, RUNGAPS:=NA_real_]
tmp <- sc[, which(grepl('RUNGAPS',TEAM))]
set(sc, tmp, 'RUNGAPS', sc[tmp, as.numeric(gsub('.*TRAIN[0-9]([0-9][0-9]).*','\\1',regmatches(FILE,regexpr('TRAIN[0-9]+',FILE))))/100])
# get RUNGAPS_EXCL column
sc[, RUNGAPS_EXCL:=NA_real_]
set(sc, sc[, which(TEAM=='RUNGAPS_ExaML')], 'RUNGAPS_EXCL', 1)
tmp <- sc[, which(TEAM=='RUNGAPS_EXCLTAXA')]
set(sc, tmp, 'RUNGAPS_EXCL', sc[tmp, as.numeric(gsub('.*TRAIN[0-9][0-9][0-9]([0-9][0-9]).*','\\1',FILE))/100])
tmp <- sc[, which(TEAM=='RUNGAPS_EXCLSITE')]
set(sc, tmp, 'RUNGAPS_EXCL', sc[tmp, as.numeric(gsub('.*EXCLSITES([0-9][0-9]).*','\\1',FILE))/100])
# get PLEN column
sc[, PLEN:=NA_real_]
tmp <- sc[, which(TEAM=='PLEN')]
set(sc, tmp, 'PLEN', sc[tmp, as.numeric(gsub('PL','',regmatches(FILE, regexpr('PL[0-9]+',FILE))))])
# merge trungps
set(trungps, NULL, 'SC', trungps[, gsub('Regional','REGIONAL',gsub('_P17|_GAG|_FULL','',SC))])
set(trungps, NULL, 'GENE', trungps[, gsub('FULL','GAG+POL+ENV',GENE)])
tmp <- subset(trungps, !is.na(RUNGAPS) & !is.na(RUNGAPS_EXCL), c(SC, TEAM, GENE, RUNGAPS, RUNGAPS_EXCL, ACTG_P, UNASS_P, SITES_N))
tmp <- unique(tmp, by=c('SC','TEAM','GENE','RUNGAPS','RUNGAPS_EXCL','SITES_N'))
sc <- merge(sc, tmp, by=c('SC','TEAM','GENE','RUNGAPS','RUNGAPS_EXCL'), all.x=1)
sclu.info <- copy(sc)
#
# end: pre-processing
#
# count trees:
subset(sa, !grepl('GTR|TRAIN3|TRAIN5',SC) & OTHER=='N' & MODEL=='R' & GENE%in%c('GAG','GAG+POL+ENV') & !grepl('MVR|BioNJ|EXCLTAXA|EXCLSITE',TEAM))
#
#
#
sc <- copy(sclu.info)
sc <- merge(sc, data.table(GENE=c('P17','GAG','GAG+PARTIALPOL','POL','GAG+POL+ENV'), GENE_L=c(396, 1440, 3080, 2843, 6807)), by='GENE')
#
#tmp <- subset(tinfo, !is.na(IDCLU))[, list(CLU_N=CLU_N[1], MXGPS_CLU= max(GPS), MDGPS_CLU=median(GPS)), by=c('SC','IDCLU')]
#sc <- merge(sc, tmp, by=c('SC','IDCLU'))
set(sc, NULL, 'MODEL', sc[, factor(MODEL, levels=c('V','R'),labels=c('Model: Village','Model: Regional'))])
set(sc, sc[, which(SC=="VILL_99_APR15")],'SC',"150701_VILL_SCENARIO-C")
set(sc, NULL, 'SC', sc[, factor(SC, levels=c("150701_REGIONAL_TRAIN1", "150701_REGIONAL_TRAIN2", "150701_REGIONAL_TRAIN3", "150701_REGIONAL_TRAIN4","150701_REGIONAL_TRAIN5","161121_REGIONAL_TRAIN6","161121_REGIONAL_TRAIN7","161121_REGIONAL_TRAIN8","150701_VILL_SCENARIO-A","150701_VILL_SCENARIO-B","150701_VILL_SCENARIO-C","150701_VILL_SCENARIO-D","150701_VILL_SCENARIO-E","161121_REGIONAL_GTRFIXED1","161121_REGIONAL_GTRFIXED2","161121_REGIONAL_GTRFIXED3"),
labels=c('sc 1','sc 2','sc 3','sc 4','sc 5','sc 6','sc 7','sc 8','sc A','sc B','sc C','sc D','sc E',"161121_REGIONAL_GTRFIXED1","161121_REGIONAL_GTRFIXED2","161121_REGIONAL_GTRFIXED3"))])
set(sc, NULL, 'GAPS', sc[, factor(GAPS, levels=c('none','low','high'),labels=c('none','as for\nBotswana\nsequences','as for\nUganda\nsequences'))])
set(sc, NULL, 'BEST', sc[, factor(BEST, levels=c('Y','N'),labels=c('best tree','replicate tree'))])
set(sc, sc[, which(GENE=='P17')], 'GENE', 'gag (p17)')
set(sc, sc[, which(GENE=='GAG')], 'GENE', 'gag')
set(sc, sc[, which(GENE=='GAG+PARTIALPOL')], 'GENE', 'gag + pol (prot,p51)')
set(sc, sc[, which(GENE=='POL')], 'GENE', 'pol')
set(sc, sc[, which(GENE=='GAG+POL+ENV')], 'GENE', 'gag+pol+env')
set(sc, sc[, which(TEAM=='IQTree')], 'TEAM', 'IQ-TREE')
set(sc, sc[, which(TEAM=='RAXML')], 'TEAM', 'RAxML')
set(sc, NULL, 'EXT', sc[, factor(EXT, levels=c('~0pc','5pc'),labels=c('~ 0%/year','5%/year'))])
set(sc, NULL, 'ART', sc[, factor(ART, levels=c('none','fast'),labels=c('none','fast'))])
sc <- subset(sc, OTHER=='N')
#
sc <- sc[, list( NRFme=mean(NRFC, na.rm=TRUE),
NQDme=mean(NQDC, na.rm=TRUE),
NPDme=mean(NPD, na.rm=TRUE),
NPDSQme=mean(NPDSQ, na.rm=TRUE),
NRFmd=median(NRFC, na.rm=TRUE),
NQDmd=median(NQDC, na.rm=TRUE),
NPDmd=median(NPD, na.rm=TRUE),
NPDSQmd=median(NPDSQ, na.rm=TRUE)
), by=c('SC','GENE','GENE_L','TEAM','BEST','IDX','FILE','GAPS','UNASS_P','RUNGAPS','RUNGAPS_EXCL','PLEN','MODEL','TAXAN','TAXAN_T','ROOTED','SEQCOV','ART','ACUTE','EXT','OTHER','SUB_IDX_T','SITES_N')]
sc <- subset(sc, MODEL=='Model: Regional')
#
# QD dist with FASTTREE
#
tmp <- subset(sc, ACUTE=='low' & GENE=='gag+pol+env' & TEAM%in%c('FT_DEFAULT','FT_PSEUDO','FT_SLOW','FT_SLOWPSEUDO','IQ-TREE','PhyML', 'RAxML'))
tmp[, REP:= gsub('.*_REP([0-9]+)_.*','\\1',FILE)]
tmp <- subset(tmp, TEAM%in%c('IQ-TREE','PhyML', 'RAxML') | (grepl('^FT',TEAM) & REP==1))
set(tmp, tmp[, which(grepl('^FT', TEAM))], 'TEAM', 'FastTree')
#tmp[, list(NQDme=mean(NQDme)), by=c('SC','GENE')]
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM)])
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag','pol','gag+pol+env'))])
ggplot(tmp, aes(x=GAPS)) +
geom_jitter(aes(y=NQDme, pch=TEAM, colour=GENE), position=position_jitter(w=0.4, h = 0), size=2) +
scale_colour_manual(values=c('gag+pol+env'="#3F4788FF")) +
scale_y_continuous(labels = scales::percent, expand=c(0,0), limits=c(0, 0.28), breaks=seq(0,1,0.1), minor_breaks=seq(0,1,0.05)) +
scale_shape_manual(values=c('FastTree'=6, 'IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
labs( x='\nUnassembled sites in simulated sequences',
y='proportion among all subtrees with 4 taxa\n',
colour='part of simulated genome\nused for tree reconstruction',
pch='tree reconstruction\nmethod') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','QD_clumean_by_missingsites_withFastTree.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
}
##--------------------------------------------------------------------------------------------------------
## olli 30.11.16
##--------------------------------------------------------------------------------------------------------
treecomparison.ana.161130.strs<- function()
{
require(ggplot2)
require(data.table)
require(ape)
require(scales)
require(ggtree)
require(phangorn)
#save(strs, strs_rtt, strs_lsd, ttrs, trungps, tinfo, tbrl, tfiles, submitted.info, sclu.info, lba, file=file.path(edir,'submitted_161123_09SBRL.rda'))
edir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation'
timetag <- '161123'
#
# merge
#
load(file.path(edir,'submitted_160713_09SBRL.rda'))
sa <- copy(submitted.info)
#load(file.path(edir,'submitted_161123_09SBRL.rda'))
load(file.path(edir,'submitted_161123_07MSELSD_noTBRL.rda'))
set(submitted.info, NULL, c('MSE_GD','MAE_GD','MSE_TP_GD','MAE_TP_GD','RUNGGAPS_EXCL'), NULL)
sa <- rbind(sa, submitted.info, use.names=TRUE, fill=TRUE)
submitted.info <- copy(sa)
# fixup RUNGAPS
tmp <- sa[, which(grepl('RUNGAPS',TEAM))]
set(sa, tmp, 'RUNGAPS', sa[tmp, as.numeric(gsub('.*TRAIN[0-9]([0-9][0-9]).*','\\1',regmatches(FILE,regexpr('TRAIN[0-9]+',FILE))))/100])
tmp <- submitted.info[, which('PLEN'==TEAM)]
set(submitted.info, tmp, 'RUNGAPS', 0)
# sa<- copy(submitted.info)
# merge trungps
set(trungps, NULL, 'SC', trungps[, gsub('Regional','REGIONAL',gsub('_P17|_GAG|_FULL','',SC))])
set(trungps, NULL, 'GENE', trungps[, gsub('FULL','GAG+POL+ENV',GENE)])
sa <- merge(sa, subset(trungps, !is.na(RUNGAPS) & !is.na(RUNGAPS_EXCL), c(SC, TEAM, GENE, RUNGAPS, RUNGAPS_EXCL, ACTG_P, UNASS_P)), by=c('SC','TEAM','GENE','RUNGAPS','RUNGAPS_EXCL'), all.x=1)
#
# make pretty
#
set(sa, sa[, which(grepl('gag+pol+env',FILE,fixed=1))], 'GENE', 'GAG+POL+ENV')
sa <- merge(sa, data.table(GENE=c('P17','GAG','GAG+PARTIALPOL','POL','GAG+POL+ENV'), GENE_L=c(396, 1440, 3080, 2843, 6807)), by='GENE')
set(sa, NULL, 'MODEL', sa[, factor(MODEL, levels=c('V','R'),labels=c('Model: Village','Model: Regional'))])
set(sa, sa[, which(SC=="VILL_99_APR15")],'SC',"150701_VILL_SCENARIO-C")
set(sa, NULL, 'SC', sa[, factor(SC, levels=c("150701_REGIONAL_TRAIN1", "150701_REGIONAL_TRAIN2", "150701_REGIONAL_TRAIN3", "150701_REGIONAL_TRAIN4","150701_REGIONAL_TRAIN5","161121_REGIONAL_TRAIN6","150701_VILL_SCENARIO-A","150701_VILL_SCENARIO-B","150701_VILL_SCENARIO-C","150701_VILL_SCENARIO-D","150701_VILL_SCENARIO-E","161121_REGIONAL_GTRFIXED1","161121_REGIONAL_GTRFIXED2","161121_REGIONAL_GTRFIXED3"),
labels=c('sc 1','sc 2','sc 3','sc 4','sc 5','sc 6','sc A','sc B','sc C','sc D','sc E',"161121_REGIONAL_GTRFIXED1","161121_REGIONAL_GTRFIXED2","161121_REGIONAL_GTRFIXED3"))])
set(sa, NULL, 'GAPS', sa[, factor(GAPS, levels=c('none','low','high'),labels=c('none','as for\nBotswana\nsequences','as for\nUganda\nsequences'))])
set(sa, NULL, 'BEST', sa[, factor(BEST, levels=c('Y','N'),labels=c('best tree','replicate tree'))])
set(sa, sa[, which(GENE=='P17')], 'GENE', 'gag (p17)')
set(sa, sa[, which(GENE=='GAG')], 'GENE', 'gag')
set(sa, sa[, which(GENE=='GAG+PARTIALPOL')], 'GENE', 'gag + pol (prot,p51)')
set(sa, sa[, which(GENE=='POL')], 'GENE', 'pol')
set(sa, sa[, which(GENE=='GAG+POL+ENV')], 'GENE', 'gag+pol+env')
set(sa, sa[, which(TEAM=='IQTree')], 'TEAM', 'IQ-TREE')
set(sa, sa[, which(TEAM=='RAXML')], 'TEAM', 'RAxML')
set(sa, NULL, 'EXT', sa[, factor(EXT, levels=c('~0pc','5pc'),labels=c('~ 0%/year','5%/year'))])
set(sa, NULL, 'ACUTE', sa[, factor(ACUTE, levels=c('low','high'),labels=c('10%','40%'))])
set(sa, NULL, 'ART', sa[, factor(ART, levels=c('none','fast'),labels=c('none','fast'))])
sa <- subset(sa, OTHER=='N')
#
# false pos transmission pairs and mean abs error by missing sites
# sparse sampling
#
tmp <- subset(sa, SC=='sc 1' &
((TEAM=='RUNGAPS_ExaML' & grepl('gag+pol+env',GENE,fixed=1)) |
(TEAM=='PLEN')))
tmp[, MISSING_P:= NA_real_]
tmp2 <- tmp[, which(TEAM=='RUNGAPS_ExaML')]
set(tmp, tmp2,'MISSING_P', tmp[tmp2, (RUNGAPS*GENE_L + (6807-GENE_L))/6807])
tmp2 <- tmp[, which(TEAM=='PLEN')]
set(tmp, tmp2,'MISSING_P', tmp[tmp2, 1-PLEN/6807])
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM, levels=c('RUNGAPS_ExaML','PLEN'), labels=c('patchy gag+pol+env sequences','partial sequences'))])
tmp[, Y:=NTPAIR_PHCL_1/(TPAIR_PHCL_1+NTPAIR_PHCL_1)]
ggplot(tmp, aes(x=MISSING_P)) +
geom_point(aes(y=Y, colour=TEAM, pch=SC), size=2) +
scale_colour_manual(values=c('partial sequences'="#35978F",'patchy gag+pol+env sequences'="#3F4788FF")) +
scale_x_continuous(labels = scales::percent, expand=c(0,0), breaks=seq(0,1,0.1)) +
scale_y_continuous(labels = scales::percent, expand=c(0,0), limits=c(0,0.8)) +
scale_shape_manual(values=c('sc 1'=17)) +
labs( x='\nproportion of missing sites, relative to gag+pol+env genome',
y='no transmission pair\n',
colour='distribution of unassembled sites',
pch='sampling coverage') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','pTransPairAmong1PCDist_by_missingsites.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
#
ggplot(tmp, aes(x=MISSING_P)) +
geom_point(aes(y=MAE_LSD, colour=TEAM, pch=SC), size=2) +
scale_x_continuous(labels = scales::percent, expand=c(0,0), breaks=seq(0,1,0.1)) +
scale_colour_manual(values=c('partial sequences'="#35978F",'patchy gag+pol+env sequences'="#3F4788FF")) +
scale_shape_manual(values=c('sc 1'=17)) +
scale_y_log10(expand=c(0,0), limit=c(1,15), breaks=c(1,1.5,2,3,4,5,10), minor_breaks=c(seq(1,10,1),seq(10,100,10),seq(100,1000,100),seq(1000,10000,1000))) +
labs( x='\nproportion of missing sites, relative to gag+pol+env genome',
y='mean absolute error in dated branches\n(years)\n',
colour='distribution of unassembled sites',
pch='sampling coverage') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','MAETP_by_missingsites.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
#
ggplot(tmp, aes(x=MISSING_P)) +
geom_point(aes(y=sign(SUM_BRANCHES_T-SUM_BRANCHES)*log10(abs(SUM_BRANCHES_T-SUM_BRANCHES)), colour=TEAM, pch=SC), size=2) +
scale_x_continuous(labels=scales::percent, expand=c(0,0), breaks=seq(0,1,0.1)) +
scale_colour_manual(values=c('partial sequences'="#35978F",'patchy gag+pol+env sequences'="#3F4788FF")) +
scale_shape_manual(values=c('sc 1'=17)) +
scale_y_continuous(breaks=c(-2,1,0,1,2), labels=c(-100,-10,0,10,100)) +
labs( x='\nproportion of missing sites, relative to gag+pol+env genome',
y='sum of branches in true tree-\nsum of branches in reconstructed tree',
colour='distribution of unassembled sites',
pch='sampling coverage') +
theme_bw() + theme(legend.position='bottom')
#
# false pos transmission pairs and mean abs error by missing sites
# dense sampling
#
tmp <- subset(sa, SC=='sc 6' &
((TEAM=='RUNGAPS_ExaML' & grepl('gag+pol+env',GENE,fixed=1)) |
(TEAM=='PLEN')))
tmp[, MISSING_P:= NA_real_]
tmp2 <- tmp[, which(TEAM=='RUNGAPS_ExaML')]
set(tmp, tmp2,'MISSING_P', tmp[tmp2, (RUNGAPS*GENE_L + (6807-GENE_L))/6807])
tmp2 <- tmp[, which(TEAM=='PLEN')]
set(tmp, tmp2,'MISSING_P', tmp[tmp2, 1-PLEN/6807])
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM, levels=c('RUNGAPS_ExaML','PLEN'), labels=c('patchy gag+pol+env sequences','partial sequences'))])
tmp[, Y:=NTPAIR_PHCL_1/(TPAIR_PHCL_1+NTPAIR_PHCL_1)]
ggplot(tmp, aes(x=MISSING_P)) +
geom_point(aes(y=Y, colour=TEAM, pch=SC), size=2) +
scale_colour_manual(values=c('partial sequences'="#35978F",'patchy gag+pol+env sequences'="#3F4788FF")) +
scale_x_continuous(labels = scales::percent, expand=c(0,0), breaks=seq(0,1,0.1)) +
scale_y_continuous(labels = scales::percent, expand=c(0,0), limits=c(0,0.8)) +
scale_shape_manual(values=c('sc 6'=16)) +
labs( x='\nproportion of missing sites, relative to gag+pol+env genome',
y='no transmission pair\n',
colour='distribution of unassembled sites',
pch='sampling coverage') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','pTransPairAmong1PCDist_by_missingsites_highcoverage.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
#
ggplot(tmp, aes(x=MISSING_P)) +
geom_point(aes(y=MAE_LSD, colour=TEAM, pch=SC), size=2) +
scale_x_continuous(labels = scales::percent, expand=c(0,0), breaks=seq(0,1,0.1)) +
scale_colour_manual(values=c('partial sequences'="#35978F",'patchy gag+pol+env sequences'="#3F4788FF")) +
scale_shape_manual(values=c('sc 6'=16)) +
scale_y_log10(expand=c(0,0), limit=c(1,15), breaks=c(1,1.5,2,3,4,5,10), minor_breaks=c(seq(1,10,1),seq(10,100,10),seq(100,1000,100),seq(1000,10000,1000))) +
labs( x='\nproportion of missing sites, relative to gag+pol+env genome',
y='mean absolute error in dated branches\n(years)\n',
colour='distribution of unassembled sites',
pch='sampling coverage') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','MAETP_by_missingsites_highcoverage.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
#
# create partial trees
#
tmp <- subset(sa, TEAM=='PLEN')
setkey(tmp, PLEN)
phs <- lapply(tmp[, IDX], function(i) strs_rtt[[i]] )
phs[[length(phs)+1]] <- ttrs[[tmp$SUB_IDX_T[1]]]
phs <- lapply( c(length(phs), seq(1,length(phs)-1)), function(i) phs[[i]] )
class(phs) <- "multiPhylo"
names(phs) <- c('True phylogeny',paste('Simulated phylogeny\nfrom partial sequences of length', tmp$PLEN))
p <- ggtree(phs, size=0.1) + facet_wrap(~.id, ncol=10)
pdf(file=file.path(edir, paste(timetag,'_strs_rtt_plen2.pdf',sep='')), w=40, h=nrow(tmp)/10*12)
print(p)
dev.off()
#
# evaluate excluding alignment columns when x-axis is %unassembled before taxa excluded
#
tmp <- subset(sa, SC=='sc 2' & GENE=='gag+pol+env' & TEAM%in%c('RUNGAPS_EXCLSITE','RUNGAPS_ExaML'))
set(tmp, tmp[, which(TEAM=='RUNGAPS_ExaML')], 'RUNGAPS_EXCL', 1.)
set(tmp, NULL, 'RUNGAPS_EXCL', tmp[, factor(RUNGAPS_EXCL, levels=c(.2,.3,.4,.5, 1.), labels=c(">20% unassembled characters",">30% unassembled characters",">40% unassembled characters",">50% unassembled characters","none"))])
tmp2 <- tmp[, list( RUNGAPS= RUNGAPS,
YM= predict(loess(MAE_LSD~RUNGAPS, span=1.5, degree=2))),
by='RUNGAPS_EXCL']
tmp <- merge(tmp, tmp2, by=c('RUNGAPS_EXCL','RUNGAPS'))
ggplot(subset(tmp, RUNGAPS_EXCL%in%c(">20% unassembled characters",">30% unassembled characters",">40% unassembled characters","none")), aes(x=RUNGAPS)) +
geom_point(data=subset(tmp, RUNGAPS_EXCL%in%c(">20% unassembled characters","none")), aes(y=MAE_LSD, colour=RUNGAPS_EXCL, pch=RUNGAPS_EXCL), size=2, show.legend=FALSE) +
geom_line(aes(y=YM, colour=RUNGAPS_EXCL)) +
scale_colour_brewer(palette='Set1') +
scale_x_continuous(labels=scales::percent, expand=c(0,0), breaks=seq(0,1,0.1), limits=c(0,0.63)) +
scale_y_continuous(expand=c(0,0), breaks=c(1,2,3,5,10,15,20), limits=c(0,20.5)) +
labs( x='\nUnassembled sites in simulated sequences',
y='mean absolute error (years)\n',
colour='sequences excluded\nbefore tree reconstruction',
pch='alignment columns excluded\nbefore tree reconstruction') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','MAETP_clumean_by_excludedsites.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
#
tmp <- subset(sa, SC=='sc 2' & GENE=='gag+pol+env' & TEAM%in%c('RUNGAPS_EXCLSITE','RUNGAPS_ExaML'))
set(tmp, tmp[, which(TEAM=='RUNGAPS_ExaML')], 'RUNGAPS_EXCL', 1.)
set(tmp, NULL, 'RUNGAPS_EXCL', tmp[, factor(RUNGAPS_EXCL, levels=c(.2,.3,.4,.5, 1.), labels=c(">20% unassembled characters",">30% unassembled characters",">40% unassembled characters",">50% unassembled characters","none"))])
tmp[, Y:=NTPAIR_PHCL_1/(TPAIR_PHCL_1+NTPAIR_PHCL_1)]
tmp <- subset(tmp, !is.na(TPAIR_PHCL_1))
tmp2 <- tmp[, list( RUNGAPS= RUNGAPS,
YM= predict(loess(Y~RUNGAPS, span=5, degree=2))),
by='RUNGAPS_EXCL']
tmp <- merge(tmp, tmp2, by=c('RUNGAPS_EXCL','RUNGAPS'))
ggplot(subset(tmp, RUNGAPS_EXCL%in%c(">20% unassembled characters",">30% unassembled characters",">40% unassembled characters","none")), aes(x=RUNGAPS)) +
geom_point(data=subset(tmp, RUNGAPS_EXCL%in%c(">20% unassembled characters","none")), aes(y=Y, colour=RUNGAPS_EXCL, pch=RUNGAPS_EXCL), size=2, show.legend=FALSE) +
geom_line(aes(y=YM, colour=RUNGAPS_EXCL)) +
scale_colour_brewer(palette='Set1') +
scale_x_continuous(labels=scales::percent, expand=c(0,0), breaks=seq(0,1,0.1), limits=c(0,0.63)) +
scale_y_continuous(labels=scales::percent, expand=c(0,0), limits=c(0,1)) +
labs( x='\nUnassembled sites in simulated sequences',
y='no transmission pair\n',
colour='sequences excluded\nbefore tree reconstruction',
pch='alignment columns excluded\nbefore tree reconstruction') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','pTransPairAmong1PCDist_clumean_by_excludedsites.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
#
# evaluate excluding taxa when x-axis is %unassembled before taxa excluded
#
tmp <- subset(sa, SC=='sc 2' & GENE=='gag+pol+env' & TEAM%in%c('RUNGAPS_EXCLTAXA','RUNGAPS_ExaML'))
set(tmp, tmp[, which(TEAM=='RUNGAPS_ExaML')], 'RUNGAPS_EXCL', 1.)
set(tmp, NULL, 'RUNGAPS_EXCL', tmp[, factor(RUNGAPS_EXCL, levels=c(0.5,.6,.7,.8,.9,1), labels=c(">50% unassembled sites",">60% unassembled sites",">70% unassembled sites",">80% unassembled sites",">90% unassembled sites","none"))])
tmp2 <- tmp[, list( RUNGAPS= RUNGAPS,
YM= predict(loess(MAE_LSD~RUNGAPS, span=5, degree=2))),
by='RUNGAPS_EXCL']
tmp <- merge(tmp, tmp2, by=c('RUNGAPS_EXCL','RUNGAPS'))
ggplot(subset(tmp, RUNGAPS_EXCL%in%c(">60% unassembled sites",">80% unassembled sites",">90% unassembled sites","none")), aes(x=RUNGAPS)) +
geom_point(data=subset(tmp, RUNGAPS_EXCL%in%c(">80% unassembled sites","none")), aes(y=MAE_LSD, colour=RUNGAPS_EXCL, pch=RUNGAPS_EXCL), size=2, show.legend=FALSE) +
geom_line(aes(y=YM, colour=RUNGAPS_EXCL)) +
scale_colour_brewer(palette='Set1') +
scale_x_continuous(labels=scales::percent, expand=c(0,0), breaks=seq(0,1,0.1), limits=c(0,0.63)) +
scale_y_continuous(expand=c(0,0), breaks=c(1,1.5,2,3,4,5,10), limits=c(0,10.5)) +
labs( x='\nUnassembled sites in simulated sequences',
y='mean absolute error (years)\n',
colour='sequences excluded\nbefore tree reconstruction',
pch='sequences excluded\nbefore tree reconstruction') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','MAETP_clumean_by_excludedtaxa.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
tmp <- subset(sa, SC=='sc 2' & GENE=='gag+pol+env' & TEAM%in%c('RUNGAPS_EXCLTAXA','RUNGAPS_ExaML'))
set(tmp, tmp[, which(TEAM=='RUNGAPS_ExaML')], 'RUNGAPS_EXCL', 1.)
set(tmp, NULL, 'RUNGAPS_EXCL', tmp[, factor(RUNGAPS_EXCL, levels=c(0.5,.6,.7,.8,.9,1), labels=c(">50% unassembled sites",">60% unassembled sites",">70% unassembled sites",">80% unassembled sites",">90% unassembled sites","none"))])
tmp[, Y:=NTPAIR_PHCL_1/(TPAIR_PHCL_1+NTPAIR_PHCL_1)]
tmp <- subset(tmp, !is.na(TPAIR_PHCL_1))
tmp2 <- tmp[, list( RUNGAPS= RUNGAPS,
YM= predict(loess(Y~RUNGAPS, span=5, degree=1))),
by='RUNGAPS_EXCL']
tmp <- merge(tmp, tmp2, by=c('RUNGAPS_EXCL','RUNGAPS'))
ggplot(subset(tmp, RUNGAPS_EXCL%in%c(">60% unassembled sites",">80% unassembled sites",">90% unassembled sites","none")), aes(x=RUNGAPS)) +
geom_point(data=subset(tmp, RUNGAPS_EXCL%in%c(">80% unassembled sites","none")), aes(y=Y, colour=RUNGAPS_EXCL, pch=RUNGAPS_EXCL), size=2, show.legend=FALSE) +
geom_line(aes(y=YM, colour=RUNGAPS_EXCL)) +
scale_colour_brewer(palette='Set1') +
scale_x_continuous(labels=scales::percent, expand=c(0,0), breaks=seq(0,1,0.1), limits=c(0,0.63)) +
scale_y_continuous(labels=scales::percent, expand=c(0,0)) +
labs( x='\nUnassembled sites in simulated sequences',
y='no transmission pair\n',
colour='sequences excluded\nbefore tree reconstruction',
pch='sequences excluded\nbefore tree reconstruction') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','pTransPairAmong1PCDist_clumean_by_excludedtaxa.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
#
# evaluate excluding taxa when x-axis is %unassembled after taxa excluded
#
tmp <- subset(sa, GENE=='gag+pol+env' & TEAM%in%c('RUNGAPS2','RUNGAPS_ExaML'))
set(tmp, NULL, 'RUNGAPS_EXCL', tmp[, factor(RUNGAPS_EXCL, levels=c(0.5,.6,.7,.8,.9,1), labels=c(">50% unassembled sites",">60% unassembled sites",">70% unassembled sites",">80% unassembled sites",">90% unassembled sites","none"))])
tmp2 <- tmp[, list( UNASS_P= UNASS_P,
YM= predict(loess(MAE_LSD~UNASS_P, span=5, degree=2))),
by='RUNGAPS_EXCL']
tmp <- merge(tmp, tmp2, by=c('RUNGAPS_EXCL','UNASS_P'))
ggplot(subset(tmp, RUNGAPS_EXCL%in%c(">60% unassembled sites",">80% unassembled sites",">90% unassembled sites","none")), aes(x=UNASS_P)) +
geom_point(data=subset(tmp, RUNGAPS_EXCL%in%c(">80% unassembled sites","none")), aes(y=MAE_LSD, colour=RUNGAPS_EXCL, pch=RUNGAPS_EXCL), size=2, show.legend=FALSE) +
geom_line(aes(y=YM, colour=RUNGAPS_EXCL)) +
scale_colour_brewer(palette='Set1') +
scale_x_continuous(labels=scales::percent, expand=c(0,0), breaks=seq(0,1,0.1), limits=c(0,0.63)) +
scale_y_continuous(expand=c(0,0), breaks=c(1,1.5,2,3,4,5,10), limits=c(0,10.5)) +
labs( x='\nUnassembled sites in simulated sequences',
y='mean absolute error (years)\n',
colour='sequences excluded\nbefore tree reconstruction',
pch='sequences excluded\nbefore tree reconstruction') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','MAETP_clumean_by_unassembledafterexclusion.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
tmp <- subset(sa, GENE=='gag+pol+env' & TEAM%in%c('RUNGAPS2','RUNGAPS_ExaML'))
set(tmp, NULL, 'RUNGAPS_EXCL', tmp[, factor(RUNGAPS_EXCL, levels=c(0.5,.6,.7,.8,.9,1), labels=c(">50% unassembled sites",">60% unassembled sites",">70% unassembled sites",">80% unassembled sites",">90% unassembled sites","none"))])
tmp[, Y:=NTPAIR_PHCL_1/(TPAIR_PHCL_1+NTPAIR_PHCL_1)]
tmp <- subset(tmp, !is.na(TPAIR_PHCL_1))
tmp2 <- tmp[, list( UNASS_P= UNASS_P,
YM= predict(loess(Y~UNASS_P, span=5, degree=1))),
by='RUNGAPS_EXCL']
tmp <- merge(tmp, tmp2, by=c('RUNGAPS_EXCL','UNASS_P'))
ggplot(subset(tmp, RUNGAPS_EXCL%in%c(">60% unassembled sites",">80% unassembled sites",">90% unassembled sites","none")), aes(x=UNASS_P)) +
geom_point(data=subset(tmp, RUNGAPS_EXCL%in%c(">80% unassembled sites","none")), aes(y=Y, colour=RUNGAPS_EXCL, pch=RUNGAPS_EXCL), size=2, show.legend=FALSE) +
geom_line(aes(y=YM, colour=RUNGAPS_EXCL)) +
scale_colour_brewer(palette='Set1') +
scale_x_continuous(labels=scales::percent, expand=c(0,0), breaks=seq(0,1,0.1), limits=c(0,0.63)) +
scale_y_continuous(labels=scales::percent, expand=c(0,0)) +
labs( x='\nUnassembled sites in simulated sequences',
y='no transmission pair\n',
colour='sequences excluded\nbefore tree reconstruction',
pch='sequences excluded\nbefore tree reconstruction') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','pTransPairAmong1PCDist_clumean_by_unassembledafterexclusion.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
tmp <- subset(sc, TEAM=='RUNGAPS_ExaML' & ((SC=='sc 1' & GENE=='gag+pol+env') | (SC=='sc 2' & GENE=='gag')))
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag','gag+pol+env'))])
setkey(tmp, GENE, RUNGAPS)
tmp2 <- tmp[, list( RUNGAPS= RUNGAPS,
NQDmeSM= predict(loess(NQDme~RUNGAPS, span=5))),
by='GENE']
tmp <- merge(tmp, tmp2, by=c('GENE','RUNGAPS'))
tmp2 <- merge( rbind(data.table(GENE=c('gag','gag+pol+env'), RUNGAPS=c(0.08, 0.17), LOC='Botswana'), data.table(GENE=c('gag','gag+pol+env'), RUNGAPS=c(0.18, 0.47), LOC='Uganda')), tmp2,by=c('GENE','RUNGAPS'))
ggplot(tmp, aes(x=RUNGAPS)) +
geom_point(aes(y=NQDme, colour=GENE), size=2, pch=16) +
geom_line(aes(y=NQDmeSM, colour=GENE), size=0.5) +
geom_point(data=tmp2, aes(y=NQDmeSM, pch=LOC), size=2.5, fill='black', stroke=1.25) +
scale_colour_manual(values=c('gag'="#FF7F00",'gag+pol+env'="grey50")) +
scale_shape_manual(values=c('Botswana'=2, 'Uganda'=5)) +
scale_x_continuous(labels = scales::percent, expand=c(0,0), limits=c(0, 0.61), breaks=seq(0,1,0.1)) +
scale_y_continuous(labels = scales::percent, expand=c(0,0), limits=c(0, 0.25)) +
#scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
labs( x='\nUnassembled sites in simulated sequences',
y='incorrectly estimated topologies of subtrees with 4 taxa\n(standardized Quartett distance)\n',
colour='part of genome used\nfor tree reconstruction',
pch='sampling location'
) +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','QD_clumean_gagfull_by_rungaps_taxan1600_Acute10pc.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
#
#
#
lba.su <- merge(lba, subset(sa, TEAM=='RUNGAPS_EXCLTAXA' | TEAM=='RUNGAPS_EXCLSITE', c(IDX, TAXAN, RUNGAPS, RUNGAPS_EXCL, OTHER)), by='IDX')
set(lba.su, NULL, 'GAPS', lba.su[, factor(GAPS, levels=c('none','low','high'),labels=c('none','as for\nBotswana\nsequences','as for\nUganda\nsequences'))])
set(lba.su, lba.su[, which(GENE=='P17')], 'GENE', 'gag (p17)')
set(lba.su, lba.su[, which(GENE=='GAG')], 'GENE', 'gag')
set(lba.su, lba.su[, which(GENE=='GAG+PARTIALPOL')], 'GENE', 'gag + pol (prot,p51)')
set(lba.su, lba.su[, which(GENE=='POL')], 'GENE', 'pol')
set(lba.su, lba.su[, which(GENE=='GAG+POL+ENV')], 'GENE', 'gag+pol+env')
set(lba.su, lba.su[, which(TEAM=='IQTree')], 'TEAM', 'IQ-TREE')
set(lba.su, lba.su[, which(TEAM=='RAXML')], 'TEAM', 'RAxML')
lba.su[, ERR:= DEPTH_T-DEPTH]
ref.box <- c(-0.04,0.04)
tmp <- lba.su[, list(TAXAN=length(ERR), OUTLIER_P=mean(ERR<ref.box[1] | ERR>ref.box[2])), by=c('MODEL','SC','TEAM','GAPS','GENE','IDX')]
tmp[, OUTLIER_N:=TAXAN*OUTLIER_P]
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM)])
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag','pol','gag+pol+env'))])
ggplot(tmp, aes(x=GAPS, y=OUTLIER_P, colour=GENE)) +
geom_jitter(position=position_jitter(w=0.8, h = 0), size=1) +
scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_continuous(labels = scales::percent) +
facet_grid(TEAM~GENE, scales='free_y') + theme_bw() + theme(legend.position='bottom') +
labs( x='\nUnassembled sites in simulated sequences',
y='branch length to root\n50% too small or too large\n(% of all taxa in tree)\n')
file <- file.path(edir, paste(timetag,'_','longbranches.pdf',sep=''))
ggsave(file=file, w=10, h=10, useDingbats=FALSE)
}
treecomparison.ana.170424.strs<- function()
{
require(ggplot2)
require(data.table)
require(ape)
require(scales)
require(ggtree)
require(phangorn)
#save(strs, strs_rtt, strs_lsd, ttrs, trungps, tinfo, tbrl, tfiles, submitted.info, sclu.info, lba, file=file.path(edir,'submitted_161123_09SBRL.rda'))
edir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation'
timetag <- '170424'
#
# merge
#
load(file.path(edir,'submitted_160713_09SBRL.rda'))
sa <- copy(submitted.info)
load(file.path(edir,'submitted_170424_09SBRL.rda'))
sa <- rbind(sa, submitted.info, use.names=TRUE, fill=TRUE)
load(file.path(edir,'submitted_161123_07MSELSD_noTBRL.rda'))
#set(submitted.info, NULL, c('MSE_GD','MAE_GD','MSE_TP_GD','MAE_TP_GD','RUNGGAPS_EXCL'), NULL)
sa <- rbind(sa, submitted.info, use.names=TRUE, fill=TRUE)
submitted.info <- copy(sa)
# fixup RUNGAPS
tmp <- sa[, which(grepl('RUNGAPS',TEAM))]
set(sa, tmp, 'RUNGAPS', sa[tmp, as.numeric(gsub('.*TRAIN[0-9]([0-9][0-9]).*','\\1',regmatches(FILE,regexpr('TRAIN[0-9]+',FILE))))/100])
tmp <- submitted.info[, which('PLEN'==TEAM)]
set(submitted.info, tmp, 'RUNGAPS', 0)
# sa<- copy(submitted.info)
# merge trungps
set(trungps, NULL, 'SC', trungps[, gsub('Regional','REGIONAL',gsub('_P17|_GAG|_FULL','',SC))])
set(trungps, NULL, 'GENE', trungps[, gsub('FULL','GAG+POL+ENV',GENE)])
sa <- merge(sa, subset(trungps, !is.na(RUNGAPS) & !is.na(RUNGAPS_EXCL), c(SC, TEAM, GENE, RUNGAPS, RUNGAPS_EXCL, ACTG_P, UNASS_P)), by=c('SC','TEAM','GENE','RUNGAPS','RUNGAPS_EXCL'), all.x=1)
#
# make pretty
#
set(sa, sa[, which(grepl('gag+pol+env',FILE,fixed=1))], 'GENE', 'GAG+POL+ENV')
sa <- merge(sa, data.table(GENE=c('P17','GAG','GAG+PARTIALPOL','POL','GAG+POL+ENV'), GENE_L=c(396, 1440, 3080, 2843, 6807)), by='GENE')
set(sa, NULL, 'MODEL', sa[, factor(MODEL, levels=c('V','R'),labels=c('Model: Village','Model: Regional'))])
set(sa, sa[, which(SC=="VILL_99_APR15")],'SC',"150701_VILL_SCENARIO-C")
set(sa, NULL, 'SC', sa[, factor(SC, levels=c("150701_REGIONAL_TRAIN1", "150701_REGIONAL_TRAIN2", "150701_REGIONAL_TRAIN3", "150701_REGIONAL_TRAIN4","150701_REGIONAL_TRAIN5","161121_REGIONAL_TRAIN6","150701_VILL_SCENARIO-A","150701_VILL_SCENARIO-B","150701_VILL_SCENARIO-C","150701_VILL_SCENARIO-D","150701_VILL_SCENARIO-E","161121_REGIONAL_GTRFIXED1","161121_REGIONAL_GTRFIXED2","161121_REGIONAL_GTRFIXED3"),
labels=c('sc 1','sc 2','sc 3','sc 4','sc 5','sc 6','sc A','sc B','sc C','sc D','sc E',"161121_REGIONAL_GTRFIXED1","161121_REGIONAL_GTRFIXED2","161121_REGIONAL_GTRFIXED3"))])
set(sa, NULL, 'GAPS', sa[, factor(GAPS, levels=c('none','low','high'),labels=c('none','as for\nBotswana\nsequences','as for\nUganda\nsequences'))])
set(sa, NULL, 'BEST', sa[, factor(BEST, levels=c('Y','N'),labels=c('best tree','replicate tree'))])
set(sa, sa[, which(GENE=='P17')], 'GENE', 'gag (p17)')
set(sa, sa[, which(GENE=='GAG')], 'GENE', 'gag')
set(sa, sa[, which(GENE=='GAG+PARTIALPOL')], 'GENE', 'gag + pol (prot,p51)')
set(sa, sa[, which(GENE=='POL')], 'GENE', 'pol')
set(sa, sa[, which(GENE=='GAG+POL+ENV')], 'GENE', 'gag+pol+env')
set(sa, sa[, which(TEAM=='IQTree')], 'TEAM', 'IQ-TREE')
set(sa, sa[, which(TEAM=='RAXML')], 'TEAM', 'RAxML')
set(sa, NULL, 'EXT', sa[, factor(EXT, levels=c('~0pc','5pc'),labels=c('~ 0%/year','5%/year'))])
set(sa, NULL, 'ACUTE', sa[, factor(ACUTE, levels=c('low','high'),labels=c('10%','40%'))])
set(sa, NULL, 'ART', sa[, factor(ART, levels=c('none','fast'),labels=c('none','fast'))])
sa <- subset(sa, OTHER=='N')
#
# proportion of recovered transmission pairs
#
tmp <- subset(sa, ACUTE=='10%' & GENE=='gag+pol+env' & TEAM%in%c('FT_DEFAULT','FT_PSEUDO','FT_SLOW','FT_SLOWPSEUDO','IQ-TREE', 'PhyML', 'RAxML'), select=c(IDX, GENE, TEAM, GAPS, TPAIR_PHCL_1, NTPAIR_PHCL_1, FILE))
tmp <- subset(tmp, TEAM%in%c('IQ-TREE', 'PhyML', 'RAxML') | (grepl('^FT',TEAM) & grepl('_REP1_',FILE)))
set(tmp, tmp[, which(grepl('^FT', TEAM))], 'TEAM', 'FastTree')
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM)])
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag','pol','gag+pol+env'))])
ggplot(tmp, aes(x=GAPS)) +
geom_jitter(aes(y=NTPAIR_PHCL_1/(TPAIR_PHCL_1+NTPAIR_PHCL_1), colour=GENE, pch=TEAM), position=position_jitter(w=0.3, h = 0), size=2) +
scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'FastTree'=6)) +
scale_colour_manual(values=c('gag+pol+env'="#3F4788FF")) +
scale_y_continuous(labels = scales::percent, limits=c(0,1), expand=c(0,0)) +
labs( x='\nunassembled sites of PANGEA-HIV sequences',
y='no transmission pair\n',
colour='part of simulated genome\nused for tree reconstruction',
pch='tree reconstruction\nmethod') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','pTransPairAmong1PCDist_by_gaps_withFastTree.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
#
# MAE of transmission pairs by TEAM
#
tmp <- subset(sa, !is.na(MAE_TP_LSD) & ACUTE=='10%' & GENE=='gag+pol+env' & TEAM%in%c('FT_DEFAULT','FT_PSEUDO','FT_SLOW','FT_SLOWPSEUDO','IQ-TREE', 'PhyML', 'RAxML'), select=c(IDX, GENE, TEAM, GAPS, MAE_TP_LSD, FILE))
tmp <- subset(tmp, TEAM%in%c('IQ-TREE', 'PhyML', 'RAxML') | (grepl('^FT',TEAM) & grepl('_REP1_',FILE)))
set(tmp, tmp[, which(grepl('^FT', TEAM))], 'TEAM', 'FastTree')
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM)])
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag','pol','gag+pol+env'))])
ggplot(subset(tmp, !is.na(MAE_TP_LSD)), aes(x=GAPS)) +
geom_jitter(aes(y=MAE_TP_LSD, colour=GENE, pch=TEAM), position=position_jitter(w=0.3, h = 0), size=2) +
scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'FastTree'=6)) +
scale_colour_manual(values=c('gag+pol+env'="#3F4788FF")) +
scale_y_log10(expand=c(0,0), limits=c(1,10), breaks=c(1,1.5,2,3,4,5,10)) +
labs( x='\nunassembled sites of PANGEA-HIV sequences',
y='mean absolute error (years)\n',
colour='part of simulated genome\nused for tree reconstruction',
pch='tree reconstruction\nmethod') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','MAETP_by_gaps_by_TEAM_withFastTree.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
#
#
#
tmp <- subset(sa, ACUTE=='10%' & GENE=='gag+pol+env' & TEAM%in%c('FT_DEFAULT','FT_PSEUDO','FT_SLOW','FT_SLOWPSEUDO','IQ-TREE', 'PhyML', 'RAxML'))
tmp <- subset(tmp, TEAM%in%c('IQ-TREE', 'PhyML', 'RAxML') | (grepl('^FT',TEAM) & grepl('_REP1_',FILE)))
set(tmp, tmp[, which(grepl('^FT', TEAM))], 'TEAM', 'FastTree')
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM)])
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag','pol','gag+pol+env'))])
tmp <- melt(tmp, measure.vars=c('TPAIR_PHCL_1','NTPAIR_PHCL_1'))
set(tmp, tmp[, which(variable=='TPAIR_PHCL_1')], 'variable', 'Yes')
set(tmp, tmp[, which(variable=='NTPAIR_PHCL_1')], 'variable', 'No')
ggplot(tmp, aes(x=factor(IDX), fill=variable)) +
geom_bar(aes(y=value), stat='identity', position='stack') +
facet_wrap(~TEAM+GAPS, ncol=3, scales='free') +
scale_fill_manual(values=c('Yes'='red','No'="grey60")) +
labs( x='\nreplicate tree reconstructions',
y='phylogenetic pairs < 1% subst/site\n',
fill='true transmission pair\nin simulation') +
theme_bw() + theme(legend.position='bottom', axis.text.x=element_blank(), axis.ticks.x=element_blank())
file <- file.path(edir, paste(timetag,'_','pTransPairAmong1PCDist_by_gaps_long_withFastTree.pdf',sep=''))
ggsave(file=file, w=8, h=12, useDingbats=FALSE)
#
# sum branches
#
tmp <- subset(sa, ACUTE=='10%' & GENE=='gag+pol+env' & TEAM%in%c('PhyML','MetaPIGA','IQ-TREE', 'RAxML','FT_DEFAULT','FT_PSEUDO','FT_SLOW','FT_SLOWPSEUDO'), select=c(IDX, GENE, TEAM, GAPS, SUM_BRANCHES_T, SUM_BRANCHES, FILE))
tmp <- subset(tmp, TEAM%in%c('IQ-TREE', 'PhyML', 'RAxML') | (grepl('^FT',TEAM) & grepl('_REP1_',FILE)))
set(tmp, tmp[, which(grepl('^FT', TEAM))], 'TEAM', 'FastTree')
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM)])
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag','pol','gag+pol+env'))])
ggplot(tmp, aes(x=GAPS)) +
geom_jitter(aes(y=sign(SUM_BRANCHES_T-SUM_BRANCHES)*log10(abs(SUM_BRANCHES_T-SUM_BRANCHES)), colour=GENE, pch=TEAM), position=position_jitter(w=0.3, h = 0), size=2) +
scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'FastTree'=6)) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_continuous(labels=c(-100,-10,0,10,100), limits=c(-log10(300),log10(300)), expand=c(0,0), breaks=seq(-2,2,1)) +
labs( x='\nunassembled sites of PANGEA-HIV sequences',
y='sum of branches in true tree -\nsum of branches in reconstructed tree',
colour='part of simulated genome\nused for tree reconstruction',
pch='tree reconstruction\nmethod') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','sumBranches_by_gaps.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
#
# plot simulated FastTree trees versus true tree
#
load(file.path(edir,'submitted_170424_09SBRL.rda'))
require(ggtree)
tmp <- subset(submitted.info, TEAM%in%c('FT_DEFAULT','FT_PSEUDO','FT_SLOW','FT_SLOWPSEUDO') & grepl('_REP1_',FILE))
invisible(tmp[,
{
#IDX <- c(531,532,533,534,535,536,537,538,539,540,748,749,750,751,752,753,754,755,756,757,870)
#TEAM <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 , 0 ,0 ,0 ,0 )
#GENE <- rep('POL', length(IDX))
tmp <- lapply(IDX, function(i) strs_rtt[[i]] )
if(MODEL[1]!='R')
tmp[[length(tmp)+1]] <- ttrs[[TIME_IDX_T[1]]]
if(MODEL[1]=='R')
tmp[[length(tmp)+1]] <- ttrs[[SUB_IDX_T[1]]]
tmp <- lapply( c(length(tmp), seq(1,length(tmp)-1)), function(i) tmp[[i]] )
class(tmp) <- "multiPhylo"
print(c('Simulated phylogeny',paste(TEAM, GENE, IDX, sep='-')))
names(tmp) <- c('Simulated phylogeny',paste(TEAM, GENE, IDX, sep='-'))
p <- ggtree(tmp, size=0.1) + theme_tree2() + facet_wrap(~.id, ncol=10, scales='free_x')
pdf(file=file.path(edir, paste(timetag,'_strs_rtt_',SC,'.pdf',sep='')), w=40, h=15)
print(p)
dev.off()
NULL
}, by=c('SC')])
}
##--------------------------------------------------------------------------------------------------------
## olli 27.06.11
##--------------------------------------------------------------------------------------------------------
treecomparison.ana.160627.strs<- function()
{
require(ggplot2)
require(data.table)
require(ape)
require(scales)
require(ggtree)
require(phangorn)
edir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation'
#timetag <- '160627'
timetag <- '160713'
load(file.path(edir,'submitted_160713_09SBRL.rda'))
set(submitted.info, submitted.info[, which(grepl('gag+pol+env',FILE,fixed=1))], 'GENE', 'GAG+POL+ENV')
sa <- copy(submitted.info)
sa <- merge(sa, data.table(GENE=c('P17','GAG','GAG+PARTIALPOL','POL','GAG+POL+ENV'), GENE_L=c(396, 1440, 3080, 2843, 6807)), by='GENE')
set(sa, NULL, 'MODEL', sa[, factor(MODEL, levels=c('V','R'),labels=c('Model: Village','Model: Regional'))])
set(sa, sa[, which(SC=="VILL_99_APR15")],'SC',"150701_VILL_SCENARIO-C")
set(sa, NULL, 'SC', sa[, factor(SC, levels=c("150701_REGIONAL_TRAIN1", "150701_REGIONAL_TRAIN2", "150701_REGIONAL_TRAIN3", "150701_REGIONAL_TRAIN4","150701_REGIONAL_TRAIN5","150701_VILL_SCENARIO-A","150701_VILL_SCENARIO-B","150701_VILL_SCENARIO-C","150701_VILL_SCENARIO-D","150701_VILL_SCENARIO-E"),
labels=c('sc 1','sc 2','sc 3','sc 4','sc 5','sc A','sc B','sc C','sc D','sc E'))])
set(sa, NULL, 'GAPS', sa[, factor(GAPS, levels=c('none','low','high'),labels=c('none','as for\nBotswana\nsequences','as for\nUganda\nsequences'))])
set(sa, NULL, 'BEST', sa[, factor(BEST, levels=c('Y','N'),labels=c('best tree','replicate tree'))])
set(sa, sa[, which(GENE=='P17')], 'GENE', 'gag (p17)')
set(sa, sa[, which(GENE=='GAG')], 'GENE', 'gag')
set(sa, sa[, which(GENE=='GAG+PARTIALPOL')], 'GENE', 'gag + pol (prot,p51)')
set(sa, sa[, which(GENE=='POL')], 'GENE', 'pol')
set(sa, sa[, which(GENE=='GAG+POL+ENV')], 'GENE', 'gag+pol+env')
set(sa, sa[, which(TEAM=='IQTree')], 'TEAM', 'IQ-TREE')
set(sa, sa[, which(TEAM=='RAXML')], 'TEAM', 'RAxML')
set(sa, NULL, 'EXT', sa[, factor(EXT, levels=c('~0pc','5pc'),labels=c('~ 0%/year','5%/year'))])
set(sa, NULL, 'ACUTE', sa[, factor(ACUTE, levels=c('low','high'),labels=c('10%','40%'))])
set(sa, NULL, 'ART', sa[, factor(ART, levels=c('none','fast'),labels=c('none','fast'))])
sa <- subset(sa, OTHER=='N')
#
# on full tree
#
#
# sum of branch lengths on full genome by method
#
tmp <- subset(sa, ACUTE=='10%' & GENE=='gag+pol+env' & TEAM%in%c('PhyML','MetaPIGA','IQ-TREE', 'RAxML'), select=c(IDX, GENE, TEAM, GAPS, SUM_BRANCHES_T, SUM_BRANCHES))
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM)])
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag','pol','gag+pol+env'))])
ggplot(tmp, aes(x=GAPS)) +
geom_jitter(aes(y=sign(SUM_BRANCHES_T-SUM_BRANCHES)*log10(abs(SUM_BRANCHES_T-SUM_BRANCHES)), colour=GENE, pch=TEAM), position=position_jitter(w=0.8, h = 0), size=2) +
scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_continuous(labels=c(-100,-10,0,10,100), limits=c(-log10(300),log10(300)), expand=c(0,0), breaks=seq(-2,2,1)) +
labs( x='\nunassembled sites of PANGEA-HIV sequences',
y='sum of branches in true tree -\nsum of branches in reconstructed tree',
colour='part of simulated genome\nused for tree reconstruction',
pch='tree reconstruction\nmethod') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','sumBranches_by_gaps.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
#
# true transmission pairs among closest with distance < 1%
#
tmp <- subset(sa, ACUTE=='10%' & GENE=='gag+pol+env' & TEAM%in%c('PhyML','MetaPIGA','IQ-TREE', 'RAxML'), select=c(IDX, GENE, TEAM, GAPS, TPAIR_PHCL_1, NTPAIR_PHCL_1))
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM)])
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag','pol','gag+pol+env'))])
ggplot(tmp, aes(x=GAPS)) +
geom_jitter(aes(y=NTPAIR_PHCL_1/(TPAIR_PHCL_1+NTPAIR_PHCL_1), colour=GENE, pch=TEAM), position=position_jitter(w=0.8, h = 0), size=2) +
scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_continuous(labels = scales::percent, limits=c(0,1), expand=c(0,0)) +
labs( x='\nunassembled sites of PANGEA-HIV sequences',
y='no transmission pair\n',
colour='part of simulated genome\nused for tree reconstruction',
pch='tree reconstruction\nmethod') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','pTransPairAmong1PCDist_by_gaps.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
#
# long
#
tmp <- subset(sa, ACUTE=='10%' & GENE=='gag+pol+env' & TEAM%in%c('PhyML', 'MetaPIGA','IQ-TREE', 'RAxML'), select=c(IDX, GENE, TEAM, GAPS, TPAIR_PHCL_1, NTPAIR_PHCL_1))
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM)])
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag','pol','gag+pol+env'))])
tmp <- melt(tmp, measure.vars=c('TPAIR_PHCL_1','NTPAIR_PHCL_1'))
set(tmp, tmp[, which(variable=='TPAIR_PHCL_1')], 'variable', 'Yes')
set(tmp, tmp[, which(variable=='NTPAIR_PHCL_1')], 'variable', 'No')
ggplot(tmp, aes(x=factor(IDX), fill=variable)) +
geom_bar(aes(y=value), stat='identity', position='stack') +
facet_wrap(~TEAM+GAPS, ncol=3, scales='free') +
scale_fill_manual(values=c('Yes'='red','No'="grey60")) +
labs( x='\nreplicate tree reconstructions',
y='phylogenetic pairs < 1% subst/site\n',
fill='true transmission pair\nin simulation') +
theme_bw() + theme(legend.position='bottom', axis.text.x=element_blank(), axis.ticks.x=element_blank())
file <- file.path(edir, paste(timetag,'_','pTransPairAmong1PCDist_by_gaps_long.pdf',sep=''))
ggsave(file=file, w=8, h=12, useDingbats=FALSE)
#
# by missing sites
#
tmp <- subset(sa, TEAM=='RUNGAPS_ExaML' & !grepl('p51', GENE))
tmp <- subset(tmp, GENE=='gag' & RUNGAPS==0.02 | GENE=='gag (p17)' & RUNGAPS==0.02 | GENE=='gag+pol+env')
tmp[, MISSING_P:= (RUNGAPS*GENE_L + (6807-GENE_L))/6807]
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag (p17)','gag','gag+pol+env'))])
setkey(tmp, GENE, RUNGAPS)
ggplot(tmp, aes(x=MISSING_P)) +
geom_point(aes(y=NTPAIR_PHCL_1/(TPAIR_PHCL_1+NTPAIR_PHCL_1), colour=GENE), size=2, pch=16) +
#geom_line(aes(y=1-TR_PAIR_recSM, colour=GENE), size=0.5) +
#geom_point(data=tmp2, aes(y=1-TR_PAIR_recSM, pch=LOC), size=2.5, fill='black') +
scale_colour_manual(values=c('gag (p17)'="#8C510A", 'gag + pol (prot,p51)'='green','gag'='red','gag+pol+env'="#3F4788FF")) +
scale_shape_manual(values=c('Botswana'=23, 'Uganda'=24)) +
scale_x_continuous(labels = scales::percent, expand=c(0,0), breaks=seq(0,1,0.1), limits=c(0,1)) +
scale_y_continuous(labels = scales::percent, expand=c(0,0), limits=c(0,1)) +
#scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
labs( x='\nproportion of missing sites, relative to gag+pol+env genome',
y='proportion of sampled transmission pairs\n',
colour='part of simulated genome\nused for tree reconstruction',
pch='sampling location') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','pTransPairAmong1PCDist_by_missingsites.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
#
# by rungaps
#
tmp <- subset(sa, TEAM=='RUNGAPS_ExaML' & !grepl('p51', GENE))
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag (p17)','gag','gag+pol+env'))])
setkey(tmp, GENE, RUNGAPS)
tmp[, Y:=NTPAIR_PHCL_1/(TPAIR_PHCL_1+NTPAIR_PHCL_1)]
tmp2 <- tmp[, list( RUNGAPS= RUNGAPS,
YM= predict(loess(Y~RUNGAPS, span=5, degree=1))),
by='GENE']
tmp <- merge(tmp, tmp2, by=c('GENE','RUNGAPS'))
tmp2 <- merge( rbind(data.table(GENE=c('gag (p17)','gag','gag + pol (prot,p51)','gag+pol+env'), RUNGAPS=c(0.11, 0.08, 0.14, 0.17), LOC='Botswana'), data.table(GENE=c('gag (p17)','gag','gag + pol (prot,p51)','gag+pol+env'), RUNGAPS=c(0.21, 0.18, 0.34, 0.47), LOC='Uganda')), tmp2,by=c('GENE','RUNGAPS'))
#set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag (p17)','gag','gag + pol (prot,p51)','gag+pol+env'))])
ggplot(tmp, aes(x=RUNGAPS)) +
geom_point(aes(y=Y, colour=GENE), size=2, pch=8) +
geom_line(aes(y=YM, colour=GENE), size=0.5) +
geom_point(data=tmp2, aes(y=YM, pch=LOC), size=2.5, fill='black') +
scale_colour_manual(values=c('gag (p17)'="#8C510A", 'gag + pol (prot,p51)'='green','gag'='red','gag+pol+env'="#3F4788FF")) +
scale_shape_manual(values=c('Botswana'=23, 'Uganda'=24)) +
scale_x_continuous(labels = scales::percent, expand=c(0,0), breaks=seq(0,1,0.1)) +
scale_y_continuous(labels = scales::percent, expand=c(0,0), limits=c(0,0.8)) +
#scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
labs( x='\nUnassembled sites in simulated sequences',
y='no transmission pair\n',
colour='part of genome used\nfor tree reconstruction',
pch='sampling location') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','pTransPairAmong1PCDist_by_rungaps.pdf',sep=''))
ggsave(file=file, w=5, h=7, useDingbats=FALSE)
#
# proportion of recovered transmission pairs
#
tmp <- subset(sa, !is.na(TR_PAIR_rec) & ACUTE=='10%' & GENE=='gag+pol+env' & TEAM%in%c('IQ-TREE', 'PhyML', 'RAxML', 'MetaPIGA'))
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM)])
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag','pol','gag+pol+env'))])
ggplot(tmp, aes(x=GAPS)) +
geom_jitter(aes(y=1-TR_PAIR_rec, colour=GENE, pch=TEAM), position=position_jitter(w=0.8, h = 0), size=2) +
scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_continuous(labels = scales::percent, limit=c(0,0.25)) +
labs( x='\nunassembled sites of PANGEA-HIV sequences',
y='proportion of sampled transmission pairs\n',
colour='part of simulated genome used\nfor tree reconstruction',
pch='tree reconstruction method') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','pTransPairRecovered_by_gaps.pdf',sep=''))
ggsave(file=file, w=5, h=7, useDingbats=FALSE)
#
# proportion of recovered transmission pairs RAxML
#
tmp <- subset(sa, !is.na(TR_PAIR_rec) & ACUTE=='10%' & TEAM=='RAxML')
tmp[, list(TR_PAIR_nrec_pc= mean(1-TR_PAIR_rec) ), by=c('SC','GENE')]
#
# proportion of recovered transmission pairs IQTree
#
tmp <- subset(sa, !is.na(TR_PAIR_rec) & ACUTE=='10%' & TEAM=='IQ-TREE')
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM)])
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag','pol','gag+pol+env'))])
ggplot(tmp, aes(x=GAPS)) +
geom_jitter(aes(y=1-TR_PAIR_rec, colour=GENE), position=position_jitter(w=0.8, h = 0), size=2) +
scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_continuous(labels = scales::percent, limit=c(-0.01,0.47), expand=c(0,0), minor_breaks=seq(0,1,0.05), breaks=seq(0,1,0.1)) +
labs( x='\nUnassembled sites in full-genome sequences',
y='phylogenetically closest pairs of individuals\nthat are transmission pairs, out of all such pairs\nthat can be identified in the true tree\n',
colour='part of genome used\nfor tree reconstruction',
pch='algorithm') +
theme_bw() + theme(legend.position='bottom') + facet_grid(~TEAM)
file <- file.path(edir, paste(timetag,'_','pTransPairRecovered_by_gaps_IQTree.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
#
# proportion of recovered transmission pairs by TEAM
#
ggplot(tmp, aes(x=GAPS)) +
geom_jitter(aes(y=1-TR_PAIR_rec, colour=GENE), position=position_jitter(w=0.8, h = 0), size=2) +
scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_continuous(labels = scales::percent, limit=c(-0.01,0.47), expand=c(0,0), minor_breaks=seq(0,1,0.05), breaks=seq(0,1,0.1)) +
labs( x='\nUnassembled sites in full-genome sequences',
y='phylogenetically closest pairs of individuals\nthat are transmission pairs, out of all such pairs\nthat can be identified in the true tree\n',
colour='part of genome used\nfor tree reconstruction',
pch='algorithm') +
theme_bw() + theme(legend.position='bottom') + facet_grid(~TEAM)
file <- file.path(edir, paste(timetag,'_','pTransPairRecovered_by_gaps_by_TEAM.pdf',sep=''))
ggsave(file=file, w=12, h=5, useDingbats=FALSE)
#
# proportion of recovered transmission pairs by TEAM
#
tmp <- subset(sa, !is.na(TR_PAIR_rec) & ACUTE=='10%' & TEAM%in%c('IQ-TREE', 'PhyML', 'RAxML', 'MetaPIGA','BioNJ','MVR'))
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM)])
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag','pol','gag+pol+env'))])
ggplot(tmp, aes(x=GAPS)) +
geom_jitter(aes(y=TR_PAIR_rec, colour=GENE, pch=TEAM), position=position_jitter(w=0.8, h = 0), size=2) +
scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17,'MVR'=7,'BioNJ'=9)) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_continuous(labels = scales::percent, limit=c(0,1)) +
labs( x='\nUnassembled sites in full-genome sequences',
y='phylogenetically closest pairs of individuals\nthat are transmission pairs, out of all such pairs\nthat can be identified in the true tree\n',
colour='part of genome used\nfor tree reconstruction',
pch='algorithm') +
theme_bw() + theme(legend.position='bottom') + facet_grid(~TEAM)
file <- file.path(edir, paste(timetag,'_','pTransPairRecovered_by_gaps_by_TEAM_withBioNJMVR.pdf',sep=''))
ggsave(file=file, w=15, h=6, useDingbats=FALSE)
#
# proportion of recovered transmission pairs by rungaps -- overall missing sites
#
# confirm proportion unassembled
if(0)
{
dre <- data.table(FA=list.files('~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/running_gaps_simulations', pattern='fa$', full.names=TRUE))
dre[, GENE:= sapply(strsplit(basename(FA),'_'),'[[',4)]
dre[, RUNGAPS:= sapply(strsplit(basename(FA),'_'),'[[',3)]
set(dre, NULL, 'RUNGAPS', dre[, as.numeric(gsub('TRAIN2','',RUNGAPS))/100])
set(dre, NULL, 'GENE', dre[, as.character(factor(GENE, levels=c('P17','GAG','GAGPP', 'FULL'), labels=c('gag (p17)', 'gag', 'gag + pol (prot,p51)','gag+pol+env')))])
dre <- dre[, {
sq <- read.dna(FA, format='fa')
z <- apply(as.character(sq),1,function(x) length(which(x=='?'))) / ncol(sq)
list(RUNGAPS_E=mean(z))
}, by=c('FA','GENE','RUNGAPS')]
dre[, FA:=NULL]
tmp <- subset(sa, TEAM=='RUNGAPS_ExaML' & !grepl('p51', GENE))
tmp <- merge(dre, tmp, by=c('GENE','RUNGAPS'))
tmp[, MISSING_P_E:= (RUNGAPS_E*GENE_L + (6807-GENE_L))/6807]
tmp[, MISSING_P:= (RUNGAPS*GENE_L + (6807-GENE_L))/6807]
}
tmp <- subset(sa, TEAM=='RUNGAPS_ExaML' & !grepl('p51', GENE))
tmp <- subset(tmp, GENE=='gag' & RUNGAPS==0.02 | GENE=='gag (p17)' & RUNGAPS==0.02 | GENE=='gag+pol+env')
tmp[, MISSING_P:= (RUNGAPS*GENE_L + (6807-GENE_L))/6807]
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag (p17)','gag','gag+pol+env'))])
setkey(tmp, GENE, RUNGAPS)
ggplot(tmp, aes(x=MISSING_P)) +
geom_point(aes(y=1-TR_PAIR_rec, colour=GENE), size=2, pch=16) +
#geom_line(aes(y=1-TR_PAIR_recSM, colour=GENE), size=0.5) +
#geom_point(data=tmp2, aes(y=1-TR_PAIR_recSM, pch=LOC), size=2.5, fill='black') +
scale_colour_manual(values=c('gag (p17)'="#8C510A", 'gag + pol (prot,p51)'='green','gag'='red','gag+pol+env'="#3F4788FF")) +
scale_shape_manual(values=c('Botswana'=23, 'Uganda'=24)) +
scale_x_continuous(labels = scales::percent, expand=c(0,0), breaks=seq(0,1,0.1), limits=c(0,1)) +
scale_y_continuous(labels = scales::percent, expand=c(0,0), limits=c(0,0.15)) +
#scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
labs( x='\nproportion of missing sites, relative to gag+pol+env genome',
y='proportion of sampled transmission pairs\n',
colour='part of simulated genome\nused for tree reconstruction',
pch='sampling location') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','pTransPairRecovered_p17full_by_missingsites_taxan1600_Acute10pc.pdf',sep=''))
ggsave(file=file, w=5, h=7, useDingbats=FALSE)
#
# proportion of recovered transmission pairs by rungaps
#
tmp <- subset(sa, TEAM=='RUNGAPS_ExaML' & !grepl('p51', GENE))
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag (p17)','gag','gag+pol+env'))])
setkey(tmp, GENE, RUNGAPS)
tmp2 <- tmp[, list( RUNGAPS= RUNGAPS,
TR_PAIR_recSM= predict(loess(TR_PAIR_rec~RUNGAPS, span=5))),
by='GENE']
tmp <- merge(tmp, tmp2, by=c('GENE','RUNGAPS'))
tmp2 <- merge( rbind(data.table(GENE=c('gag (p17)','gag','gag + pol (prot,p51)','gag+pol+env'), RUNGAPS=c(0.11, 0.08, 0.14, 0.17), LOC='Botswana'), data.table(GENE=c('gag (p17)','gag','gag + pol (prot,p51)','gag+pol+env'), RUNGAPS=c(0.21, 0.18, 0.34, 0.47), LOC='Uganda')), tmp2,by=c('GENE','RUNGAPS'))
#set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag (p17)','gag','gag + pol (prot,p51)','gag+pol+env'))])
ggplot(tmp, aes(x=RUNGAPS)) +
geom_point(aes(y=1-TR_PAIR_rec, colour=GENE), size=2, pch=8) +
geom_line(aes(y=1-TR_PAIR_recSM, colour=GENE), size=0.5) +
geom_point(data=tmp2, aes(y=1-TR_PAIR_recSM, pch=LOC), size=2.5, fill='black') +
scale_colour_manual(values=c('gag (p17)'="#8C510A", 'gag + pol (prot,p51)'='green','gag'='red','gag+pol+env'="#3F4788FF")) +
scale_shape_manual(values=c('Botswana'=23, 'Uganda'=24)) +
scale_x_continuous(labels = scales::percent, expand=c(0,0), breaks=seq(0,1,0.1)) +
scale_y_continuous(labels = scales::percent, expand=c(0,0)) +
#scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
labs( x='\nUnassembled sites in simulated sequences',
y='phylogenetically closest pairs of individuals\nthat are transmission pairs, out of all such pairs\nthat can be identified in the true tree\n',
colour='part of genome used\nfor tree reconstruction',
pch='sampling location') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','pTransPairRecovered_p17full_by_rungaps_taxan1600_Acute10pc.pdf',sep=''))
ggsave(file=file, w=5, h=7, useDingbats=FALSE)
#
# long branches on regional
#
lba.su <- merge(lba, subset(sa, select=c(IDX, TAXAN, RUNGAPS, OTHER)), by='IDX')
set(lba.su, NULL, 'GAPS', lba.su[, factor(GAPS, levels=c('none','low','high'),labels=c('none','as for\nBotswana\nsequences','as for\nUganda\nsequences'))])
set(lba.su, lba.su[, which(GENE=='P17')], 'GENE', 'gag (p17)')
set(lba.su, lba.su[, which(GENE=='GAG')], 'GENE', 'gag')
set(lba.su, lba.su[, which(GENE=='GAG+PARTIALPOL')], 'GENE', 'gag + pol (prot,p51)')
set(lba.su, lba.su[, which(GENE=='POL')], 'GENE', 'pol')
set(lba.su, lba.su[, which(GENE=='GAG+POL+ENV')], 'GENE', 'gag+pol+env')
set(lba.su, lba.su[, which(TEAM=='IQTree')], 'TEAM', 'IQ-TREE')
set(lba.su, lba.su[, which(TEAM=='RAXML')], 'TEAM', 'RAxML')
# get reference of error without long branches
lba.su[, ERR:= DEPTH_T-DEPTH]
# these are the closest trees in terms of NRF
#ref.box <- subset(lba.su, IDX==858)[, quantile(ERR, p=c(0.25, 0.75))+c(-1,1)*3*diff(quantile(ERR, p=c(0.25, 0.75)))]
#ref.box <- subset(lba.su, IDX==2)[, quantile(ERR, p=c(0.25, 0.75))+c(-1,1)*3*diff(quantile(ERR, p=c(0.25, 0.75)))]
#subset(lba.su, IDX==858)[, sd(ERR)*10]
# this suggests the following Tukey criterion:
#ref.box <- c(-0.04,0.04)
ref.box <- c(-0.1,0.1)
#
# severe branch lengths errors by team
#
tmp <- subset(lba.su, OTHER=='N' & TEAM!='RUNGAPS_ExaML')[, list(TAXAN=length(ERR), OUTLIER_P=mean(ERR<ref.box[1] | ERR>ref.box[2])), by=c('MODEL','SC','TEAM','GAPS','GENE','IDX')]
tmp[, OUTLIER_N:=TAXAN*OUTLIER_P]
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM)])
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag','pol','gag+pol+env'))])
ggplot(tmp, aes(x=GAPS, y=OUTLIER_P, colour=GENE)) +
geom_jitter(position=position_jitter(w=0.8, h = 0), size=1) +
scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_continuous(labels = scales::percent) +
facet_grid(TEAM~GENE, scales='free_y') + theme_bw() + theme(legend.position='bottom') +
labs( x='\nUnassembled sites in simulated sequences',
y='branch length to root\n50% too small or too large\n(% of all taxa in tree)\n')
file <- file.path(edir, paste(timetag,'_','longbranches.pdf',sep=''))
ggsave(file=file, w=10, h=10, useDingbats=FALSE)
#
tmp <- subset(lba.su, OTHER=='N' & TEAM!='RUNGAPS_ExaML')[, list(TAXAN=length(ERR), OUTLIER_P=mean(ERR< -0.04 | ERR> 0.04)), by=c('MODEL','SC','TEAM','GAPS','GENE','IDX')]
tmp[, OUTLIER_N:=TAXAN*OUTLIER_P]
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM)])
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag','pol','gag+pol+env'))])
ggplot(tmp, aes(x=GAPS, y=OUTLIER_P, colour=GENE)) +
geom_jitter(position=position_jitter(w=0.8, h = 0), size=1) +
scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_continuous(labels = scales::percent) +
facet_grid(TEAM~GENE, scales='free_y') + theme_bw() + theme(legend.position='bottom') +
labs( x='\nUnassembled sites in simulated sequences',
y='branch length to root\n20% too small or too large\n(% of all taxa in tree)\n')
file <- file.path(edir, paste(timetag,'_','longbranches20pc.pdf',sep=''))
ggsave(file=file, w=10, h=10, useDingbats=FALSE)
#
# severe branch lengths errors by rungaps
#
tmp <- subset(lba.su, TEAM=='RUNGAPS_ExaML' & !grepl('p51', GENE))
tmp <- tmp[, list(TAXAN=length(ERR), OUTLIER_P=mean(ERR< -0.04 | ERR>0.04)), by=c('MODEL','SC','TEAM','GAPS','GENE','RUNGAPS','IDX')]
tmp[, OUTLIER_N:=TAXAN*OUTLIER_P]
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag (p17)','gag','gag+pol+env'))])
setkey(tmp, GENE, RUNGAPS)
tmp2 <- tmp[, list( RUNGAPS= RUNGAPS,
OUTLIER_P_SM= predict(loess(OUTLIER_P~RUNGAPS, span=5))),
by='GENE']
tmp <- merge(tmp, tmp2, by=c('GENE','RUNGAPS'))
tmp2 <- merge( rbind(data.table(GENE=c('gag (p17)','gag','gag + pol (prot,p51)','gag+pol+env'), RUNGAPS=c(0.11, 0.08, 0.14, 0.17), LOC='Botswana'), data.table(GENE=c('gag (p17)','gag','gag + pol (prot,p51)','gag+pol+env'), RUNGAPS=c(0.21, 0.18, 0.34, 0.47), LOC='Uganda')), tmp2,by=c('GENE','RUNGAPS'))
#set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag (p17)','gag','gag + pol (prot,p51)','gag+pol+env'))])
ggplot(tmp, aes(x=RUNGAPS)) +
geom_point(aes(y=OUTLIER_P, colour=GENE), size=2, pch=8) +
#geom_line(aes(y=OUTLIER_P_SM, colour=GENE), size=0.5) +
#geom_point(data=tmp2, aes(y=OUTLIER_P_SM, pch=LOC), size=2.5, fill='black') +
scale_colour_manual(values=c('gag (p17)'="#8C510A", 'gag + pol (prot,p51)'='green','gag'='red','gag+pol+env'="#3F4788FF")) +
scale_shape_manual(values=c('Botswana'=23, 'Uganda'=24)) +
scale_x_continuous(labels = scales::percent, expand=c(0,0), breaks=seq(0,1,0.1)) +
scale_y_continuous(labels = scales::percent) +
#scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
labs( x='\nUnassembled sites in simulated sequences',
y='branch length to root\n20% too small or too large\n(% of all taxa in tree)\n',
colour='part of genome used\nfor tree reconstruction',
pch='sampling location') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','longbranches_p17full_by_rungaps_taxan1600_Acute10pc.pdf',sep=''))
ggsave(file=file, w=5, h=7, useDingbats=FALSE)
#
# MSE by TEAM
#
tmp <- subset(sa, !is.na(TR_PAIR_rec) & ACUTE=='10%' & TEAM%in%c('IQ-TREE', 'PhyML', 'MetaPIGA', 'RAxML'))
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM)])
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag','pol','gag+pol+env'))])
ggplot(subset(tmp, !is.na(MSE_LSD)), aes(x=GAPS)) +
geom_jitter(aes(y=sqrt(MSE_LSD), colour=GENE, pch=TEAM), position=position_jitter(w=0.8, h = 0), size=2) +
scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_log10(limit=c(1,1e4)) +
labs( x='\nUnassembled sites in full-genome sequences',
y='root mean squared error\nin dated branches\n(years)\n',
colour='part of genome used\nfor tree reconstruction',
pch='algorithm') +
theme_bw() + theme(legend.position='bottom') + facet_grid(~TEAM)
file <- file.path(edir, paste(timetag,'_','RMSE_by_gaps_by_TEAM.pdf',sep=''))
ggsave(file=file, w=12, h=6, useDingbats=FALSE)
#
# MAE overall by TEAM (similar)
#
tmp <- subset(sa, !is.na(TR_PAIR_rec) & ACUTE=='10%' & TEAM%in%c('IQ-TREE', 'PhyML', 'MetaPIGA', 'RAxML'))
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM)])
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag','pol','gag+pol+env'))])
ggplot(subset(tmp, !is.na(MAE_LSD)), aes(x=GAPS)) +
geom_jitter(aes(y=MAE_LSD, colour=GENE, pch=TEAM), position=position_jitter(w=0.8, h = 0), size=2) +
scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_log10(expand=c(0,0), limit=c(1,1e4), breaks=c(1,10,100,1000,1e4), minor_breaks=c(seq(1,10,1),seq(10,100,10),seq(100,1000,100),seq(1000,10000,1000))) +
labs( x='\nUnassembled sites in full-genome sequences',
y='mean absolute error in dated branches\n(years)\n',
colour='part of genome used\nfor tree reconstruction',
pch='algorithm') +
theme_bw() + theme(legend.position='bottom') + facet_grid(~TEAM)
file <- file.path(edir, paste(timetag,'_','MAE_by_gaps_by_TEAM.pdf',sep=''))
ggsave(file=file, w=12, h=6, useDingbats=FALSE)
#
# MAE overall pairs by by rungaps
#
tmp <- subset(sa, TEAM=='RUNGAPS_ExaML' & !grepl('p51', GENE))
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag (p17)','gag','gag+pol+env'))])
setkey(tmp, GENE, RUNGAPS)
tmp2 <- tmp[, list( RUNGAPS= RUNGAPS,
MAE_LSD_SM= predict(loess(MAE_LSD~RUNGAPS, degree=2, span=20))),
by='GENE']
tmp <- merge(tmp, tmp2, by=c('GENE','RUNGAPS'))
tmp2 <- merge( rbind(data.table(GENE=c('gag (p17)','gag','gag + pol (prot,p51)','gag+pol+env'), RUNGAPS=c(0.11, 0.08, 0.14, 0.17), LOC='Botswana'), data.table(GENE=c('gag (p17)','gag','gag + pol (prot,p51)','gag+pol+env'), RUNGAPS=c(0.21, 0.18, 0.34, 0.47), LOC='Uganda')), tmp2,by=c('GENE','RUNGAPS'))
#set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag (p17)','gag','gag + pol (prot,p51)','gag+pol+env'))])
ggplot(tmp, aes(x=RUNGAPS)) +
geom_point(aes(y=MAE_LSD, colour=GENE), size=2, pch=8) +
geom_line(aes(y=MAE_LSD_SM, colour=GENE), size=0.5) +
geom_point(data=tmp2, aes(y=MAE_LSD_SM, pch=LOC), size=2.5, fill='black') +
scale_colour_manual(values=c('gag (p17)'="#8C510A", 'gag + pol (prot,p51)'='green','gag'='red','gag+pol+env'="#3F4788FF")) +
scale_shape_manual(values=c('Botswana'=23, 'Uganda'=24)) +
scale_x_continuous(labels = scales::percent, expand=c(0,0), breaks=seq(0,1,0.1)) +
scale_y_continuous(expand=c(0,0), limit=c(0,20), breaks=seq(0,20,5), minor_breaks=seq(0,10,1)) +
#scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
labs( x='\nUnassembled sites in simulated sequences',
y='mean absolute error in dated branches\n(years)\n',
colour='part of genome used\nfor tree reconstruction',
pch='sampling location') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','MAE_p17full_by_rungaps_taxan1600_Acute10pc.pdf',sep=''))
ggsave(file=file, w=5, h=7, useDingbats=FALSE)
#
# MAE of transmission pairs by RAxML
#
tmp <- subset(sa, !is.na(MAE_TP_LSD) & ACUTE=='10%' & TEAM=='RAxML')
tmp[, list(MAE=mean(MAE_TP_LSD)), by=c('SC','GENE')]
#
# MAE of transmission pairs by TEAM
#
tmp <- subset(sa, !is.na(MAE_TP_LSD) & ACUTE=='10%' & GENE=='gag+pol+env' & TEAM%in%c('IQ-TREE', 'PhyML', 'MetaPIGA', 'RAxML'))
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM)])
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag','pol','gag+pol+env'))])
ggplot(subset(tmp, !is.na(MAE_TP_LSD)), aes(x=GAPS)) +
geom_jitter(aes(y=MAE_TP_LSD, colour=GENE, pch=TEAM), position=position_jitter(w=0.8, h = 0), size=2) +
scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_log10(expand=c(0,0), limits=c(1,10), breaks=c(1,1.5,2,3,4,5,10)) +
labs( x='\nunassembled sites of PANGEA-HIV sequences',
y='mean absolute error (years)\n',
colour='part of simulated genome\nused for tree reconstruction',
pch='tree reconstruction\nmethod') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','MAETP_by_gaps_by_TEAM.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
#
# MAE_LSD of transmission pairs IQTree
#
tmp <- subset(sa, !is.na(MAE_TP_LSD) & ACUTE=='10%' & TEAM=='IQ-TREE')
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM)])
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag','pol','gag+pol+env'))])
ggplot(subset(tmp, !is.na(MAE_TP_LSD)), aes(x=GAPS)) +
geom_jitter(aes(y=MAE_TP_LSD, colour=GENE), position=position_jitter(w=0.8, h = 0), size=2) +
scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_log10(expand=c(0,0), limit=c(1,32), breaks=c(1,10,100,1000), minor_breaks=c(seq(1,10,1),seq(10,100,10),seq(100,1000,100),seq(1000,10000,1000))) +
labs( x='\nUnassembled sites in full-genome sequences',
y='mean absolute error in dated branches\namong sampled transmission pairs\n(years)\n',
colour='part of genome used\nfor tree reconstruction',
pch='algorithm') +
theme_bw() + theme(legend.position='bottom') + facet_grid(~TEAM)
file <- file.path(edir, paste(timetag,'_','MAETP_by_gaps_IQTree.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
#
# MAE of transmission pairs IQTree
#
tmp <- subset(sa, !is.na(MAE_TP_LSD) & ACUTE=='10%' & TEAM=='IQ-TREE')
set(tmp, NULL, 'TEAM', tmp[, factor(TEAM)])
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag','pol','gag+pol+env'))])
ggplot(subset(tmp, !is.na(MAE_TP)), aes(x=GAPS)) +
geom_jitter(aes(y=MAE_TP, colour=GENE), position=position_jitter(w=0.8, h = 0), size=2) +
scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
#scale_y_log10(expand=c(0,0), limit=c(1,32), breaks=c(1,10,100,1000), minor_breaks=c(seq(1,10,1),seq(10,100,10),seq(100,1000,100),seq(1000,10000,1000))) +
labs( x='\nUnassembled sites in full-genome sequences',
y='mean absolute error in dated branches\namong sampled transmission pairs\n(years)\n',
colour='part of genome used\nfor tree reconstruction',
pch='algorithm') +
theme_bw() + theme(legend.position='bottom') + facet_grid(~TEAM)
file <- file.path(edir, paste(timetag,'_','MAETP_by_gaps_IQTree.pdf',sep=''))
ggsave(file=file, w=4.5, h=6, useDingbats=FALSE)
#
# MAE of transmission pairs by by rungaps
#
tmp <- subset(sa, TEAM=='RUNGAPS_ExaML' & !grepl('p51', GENE))
tmp <- subset(tmp, GENE=='gag' & RUNGAPS==0.02 | GENE=='gag (p17)' & RUNGAPS==0.02 | GENE=='gag+pol+env')
tmp[, MISSING_P:= (RUNGAPS*GENE_L + (6807-GENE_L))/6807]
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag (p17)','gag','gag+pol+env'))])
#set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag (p17)','gag','gag + pol (prot,p51)','gag+pol+env'))])
ggplot(tmp, aes(x=MISSING_P)) +
geom_point(aes(y=MAE_TP_LSD, colour=GENE), size=2, pch=16) +
#geom_line(aes(y=MAE_TP_LSD_SM, colour=GENE), size=0.5) +
#geom_point(data=tmp2, aes(y=MAE_TP_LSD_SM, pch=LOC), size=2.5, fill='black') +
scale_colour_manual(values=c('gag (p17)'="#8C510A", 'gag + pol (prot,p51)'='green','gag'='red','gag+pol+env'="#3F4788FF")) +
scale_shape_manual(values=c('Botswana'=23, 'Uganda'=24)) +
scale_x_continuous(labels = scales::percent, expand=c(0,0), breaks=seq(0,1,0.1), limits=c(0,1)) +
scale_y_continuous(expand=c(0,0), limit=c(0,6), breaks=seq(0,10,1), minor_breaks=seq(0,10,.5)) +
#scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
labs( x='\nProportion of missing sites relative to gag+pol+env genome',
y='mean absolute error (years)\n',
colour='part of simulated genome used\nfor tree reconstruction') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','MAETP_p17full_by_missingsites_taxan1600_Acute10pc.pdf',sep=''))
ggsave(file=file, w=5, h=7, useDingbats=FALSE)
#
#
tmp <- subset(sa, TEAM=='RUNGAPS_ExaML' & !grepl('p51', GENE))
set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag (p17)','gag','gag+pol+env'))])
setkey(tmp, GENE, RUNGAPS)
tmp2 <- tmp[, list( RUNGAPS= RUNGAPS,
MAE_TP_LSD_SM= predict(loess(MAE_TP_LSD~RUNGAPS, degree=2, span=5))),
by='GENE']
tmp <- merge(tmp, tmp2, by=c('GENE','RUNGAPS'))
tmp2 <- merge( rbind(data.table(GENE=c('gag (p17)','gag','gag + pol (prot,p51)','gag+pol+env'), RUNGAPS=c(0.11, 0.08, 0.14, 0.17), LOC='Botswana'), data.table(GENE=c('gag (p17)','gag','gag + pol (prot,p51)','gag+pol+env'), RUNGAPS=c(0.21, 0.18, 0.34, 0.47), LOC='Uganda')), tmp2,by=c('GENE','RUNGAPS'))
#set(tmp, NULL, 'GENE', tmp[, factor(GENE, levels=c('gag (p17)','gag','gag + pol (prot,p51)','gag+pol+env'))])
ggplot(tmp, aes(x=RUNGAPS)) +
geom_point(aes(y=MAE_TP_LSD, colour=GENE), size=2, pch=8) +
geom_line(aes(y=MAE_TP_LSD_SM, colour=GENE), size=0.5) +
geom_point(data=tmp2, aes(y=MAE_TP_LSD_SM, pch=LOC), size=2.5, fill='black') +
scale_colour_manual(values=c('gag (p17)'="#8C510A", 'gag + pol (prot,p51)'='green','gag'='red','gag+pol+env'="#3F4788FF")) +
scale_shape_manual(values=c('Botswana'=23, 'Uganda'=24)) +
scale_x_continuous(labels = scales::percent, expand=c(0,0), breaks=seq(0,1,0.1)) +
scale_y_continuous(expand=c(0,0), limit=c(0,7), breaks=seq(0,10,2), minor_breaks=seq(0,10,.5)) +
#scale_shape_manual(values=c('IQ-TREE'=15, 'PhyML'=12, 'RAxML'=8, 'MetaPIGA'=17)) +
labs( x='\nUnassembled sites in simulated sequences',
y='mean absolute error in dated branches\namong sampled transmission pairs\n(years)\n',
colour='part of genome used\nfor tree reconstruction',
pch='sampling location') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','MAETP_p17full_by_rungaps_taxan1600_Acute10pc.pdf',sep=''))
ggsave(file=file, w=5, h=7, useDingbats=FALSE)
#
#
# plot simulated trees versus true tree
#
require(ggtree)
setkey(sa, SC, TEAM, GENE, RUNGAPS)
invisible(subset(sa, TEAM!='RUNGAPS_ExaML')[,
{
#IDX <- c(531,532,533,534,535,536,537,538,539,540,748,749,750,751,752,753,754,755,756,757,870)
#TEAM <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 , 0 ,0 ,0 ,0 )
#GENE <- rep('POL', length(IDX))
tmp <- lapply(IDX, function(i) strs_rtt[[i]] )
if(MODEL[1]!='R')
tmp[[length(tmp)+1]] <- ttrs[[TIME_IDX_T[1]]]
if(MODEL[1]=='R')
tmp[[length(tmp)+1]] <- ttrs[[SUB_IDX_T[1]]]
tmp <- lapply( c(length(tmp), seq(1,length(tmp)-1)), function(i) tmp[[i]] )
class(tmp) <- "multiPhylo"
print(c('Simulated phylogeny',paste(TEAM, GENE, IDX, sep='-')))
names(tmp) <- c('Simulated phylogeny',paste(TEAM, GENE, IDX, sep='-'))
p <- ggtree(tmp, size=0.1) + facet_wrap(~.id, ncol=10, scales='free_x')
pdf(file=file.path(edir, paste(timetag,'_strs_rtt_',SC,'.pdf',sep='')), w=40, h=length(IDX)/10*12)
print(p)
dev.off()
NULL
}, by=c('SC')])
invisible(subset(sa, TEAM=='RUNGAPS_ExaML')[,
{
#IDX <- c(531,532,533,534,535,536,537,538,539,540,748,749,750,751,752,753,754,755,756,757,870)
#TEAM <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 , 0 ,0 ,0 ,0 )
#GENE <- rep('POL', length(IDX))
tmp <- lapply(IDX, function(i) strs_rtt[[i]] )
tmp[[length(tmp)+1]] <- ttrs[[SUB_IDX_T[1]]]
tmp <- lapply( c(length(tmp), seq(1,length(tmp)-1)), function(i) tmp[[i]] )
class(tmp) <- "multiPhylo"
print(c('Simulated phylogeny',paste(TEAM, GENE, IDX, RUNGAPS, sep='-')))
names(tmp) <- c('Simulated phylogeny',paste(GENE, IDX, RUNGAPS, sep='-'))
p <- ggtree(tmp, size=0.1) + facet_wrap(~.id, ncol=10)
pdf(file=file.path(edir, paste(timetag,'_strs_rtt_rungaps_',GENE,'.pdf',sep='')), w=40, h=length(IDX)/10*12)
print(p)
dev.off()
NULL
}, by=c('GENE')])
#
# plot trees
#
tmpdir <- '~/duke/tmp'
sc_id <- 'sc 4'
gene_id <- 'gag'
team_id <- 'PhyML'
for(sc_id in c("sc 1","sc 2","sc 4"))
{
for(team_id in c("PhyML"))
{
for(gene_id in c('gag','gag+pol+env'))
{
#gene_id<- 'POL'
tmp <- subset(sa, TEAM==team_id & SC==sc_id & GENE==gene_id)
#if(team_id=="MetaPIGA")
# tmp <- subset(tmp, grepl("best solution_use",FILE))
phr <- ttrs[[ tmp[1, SUB_IDX_T] ]]
phs <- lapply(tmp[, IDX], function(i) strs_rtt[[i]] )
# drop to common tips
z <- setdiff(phr$tip.label, phs[[1]]$tip.label)
#stopifnot( length(z)==abs(diff(c(TAXAN, TAXAN_T))) )
if(length(z))
phr <- drop.tip(phr, z, rooted=TRUE, root.edge=1)
phr <- multi2di(phr,random =FALSE)
pho <- treedist.get.tree.100bs(phr, phs, tmpdir)
ggtree(pho, aes(color=bootstrap), size=0.2) +
scale_colour_continuous(low="grey70", high="#1B0C42FF", guide="none") +
#scale_colour_continuous(low="#1B0C42FF", high="#D64B40FF", guide="none") +
theme_tree2(legend.position='right') + theme(axis.line.x=element_line()) + labs(x='average substions per site', axis.title=element_text(size=2))
file <- file.path(edir, paste(timetag, '_AgreeTree100bs_', sc_id, '_', team_id, '_', gene_id, 'std.pdf', sep=''))
ggsave(file=file, w=6, h=35)
ggtree(pho, layout="fan", aes(color=bootstrap), size=0.2) +
scale_colour_continuous(low="grey70", high="#1B0C42FF", guide="none") +
#scale_colour_continuous(low="#1B0C42FF", high="#D64B40FF", guide="none") +
theme_tree2(legend.position='right') + theme(axis.text=element_blank())
file <- file.path(edir, paste(timetag, '_AgreeTree100bs_', gsub(' ','-',sc_id), '_', team_id, '_', gene_id, 'circular.pdf', sep=''))
ggsave(file=file, w=6, h=6)
}
}
}
}
##--------------------------------------------------------------------------------------------------------
## olli 27.06.11
##--------------------------------------------------------------------------------------------------------
treecomparison.ana.160627<- function()
{
require(ggplot2)
require(data.table)
require(ape)
require(scales)
require(ggtree)
require(phangorn)
edir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation'
#timetag <- '160627'
timetag <- '160713'
load(paste(edir,'/','submitted_160713_RFPDQDTP.rda',sep=''))
#
#
#
sa <- copy(submitted.info)
#
set(sa, NULL, 'MODEL', sa[, factor(MODEL, levels=c('V','R'),labels=c('Model: Village','Model: Regional'))])
set(sa, sa[, which(SC=="VILL_99_APR15")],'SC',"150701_VILL_SCENARIO-C")
set(sa, NULL, 'SC', sa[, factor(SC, levels=c("150701_REGIONAL_TRAIN1", "150701_REGIONAL_TRAIN2", "150701_REGIONAL_TRAIN3", "150701_REGIONAL_TRAIN4","150701_REGIONAL_TRAIN5","150701_VILL_SCENARIO-A","150701_VILL_SCENARIO-B","150701_VILL_SCENARIO-C","150701_VILL_SCENARIO-D","150701_VILL_SCENARIO-E"),
labels=c('sc 1','sc 2','sc 3','sc 4','sc 5','sc A','sc B','sc C','sc D','sc E'))])
set(sa, NULL, 'GAPS', sa[, factor(GAPS, levels=c('none','low','high'),labels=c('none','as for Botswana\nsequences','as for Uganda\nsequences'))])
set(sa, NULL, 'BEST', sa[, factor(BEST, levels=c('Y','N'),labels=c('best tree','replicate tree'))])
set(sa, NULL, 'GENE', sa[, factor(GENE, levels=c('GAG','POL','GAG+POL+ENV'),labels=c('gag','pol','gag+pol+env'))])
set(sa, NULL, 'TEAM', sa[, factor(TEAM, levels=sa[, sort(unique(TEAM))],labels=sa[, sort(unique(TEAM))])])
set(sa, NULL, 'EXT', sa[, factor(EXT, levels=c('~0pc','5pc'),labels=c('~ 0%/year','5%/year'))])
set(sa, NULL, 'ACUTE', sa[, factor(ACUTE, levels=c('low','high'),labels=c('10%','40%'))])
set(sa, NULL, 'ART', sa[, factor(ART, levels=c('none','fast'),labels=c('none','fast'))])
sa <- subset(sa, OTHER=='N')
#
# on full tree
#
# prob closest on 4.5% (confounded by branch lengths)
#tmp <- merge(sa, subset(lba.su, OUTLIER_P<0.2, IDX), by='IDX')
tmp <- subset(sa, !is.na(TR_REC_perc_45) & GENE=='gag+pol+env' & ACUTE=='10%' & TEAM%in%c('RAxML','IQ-TREE'))
ggplot(tmp, aes(x=GAPS)) +
geom_jitter(aes(y=TR_REC_perc_45, colour=GENE, pch=TEAM), position=position_jitter(w=0.8, h = 0), size=2) +
scale_shape_manual(values=c('IQTree'=15, 'PhyML'=12, 'RAXML'=8, 'MetaPIGA'=17)) +
geom_point(aes(y=TR_REC_perc_T_45), colour="black", size=2) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_continuous(labels = scales::percent, limit=c(0,0.3), expand=c(0,0)) +
labs( x='\nGappiness of full-genome sequences',
y='phylogenetically closest pairs of individual\nthat are true transmission pairs\n',
colour='part of genome used\nfor tree reconstruction',
pch='algorithm') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','pTransRec45_by_gaps.pdf',sep=''))
ggsave(file=file, w=5, h=7)
# prob closest w/o GD criterion
tmp <- subset(sa, !is.na(TR_REC_perc_Inf) & ACUTE=='10%' & TEAM%in%c('RAXML','IQTree'))
ggplot(tmp, aes(x=GAPS)) +
geom_jitter(aes(y=TR_REC_perc_Inf, colour=GENE, pch=TEAM), position=position_jitter(w=0.8, h = 0), size=2) +
scale_shape_manual(values=c('IQTree'=15, 'PhyML'=12, 'RAXML'=8, 'MetaPIGA'=17)) +
geom_point(aes(y=TR_REC_perc_T_Inf), colour="black", size=2) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_continuous(labels = scales::percent, limit=c(0,0.1), expand=c(0,0)) +
labs( x='\nGappiness of full-genome sequences',
y='phylogenetically closest pairs of individuals\nthat are true transmission pairs\n',
colour='part of genome used\nfor tree reconstruction',
pch='algorithm') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','pTransRecInf_by_gaps.pdf',sep=''))
ggsave(file=file, w=5, h=7)
# instead, proportion of recovered transmission pairs?
# get list of correct pairs in true phylogeny. how many of these do we see in reconstructed phylogeny?
tmp <- subset(sa, !is.na(TR_PAIR_rec) & ACUTE=='10%')
ggplot(tmp, aes(x=GAPS)) +
geom_jitter(aes(y=TR_PAIR_rec, colour=GENE, pch=TEAM), position=position_jitter(w=0.8, h = 0), size=2) +
scale_shape_manual(values=c('IQTree'=15, 'PhyML'=12, 'RAXML'=8, 'MetaPIGA'=17)) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_continuous(labels = scales::percent, limit=c(0.5,1)) +
labs( x='\nGappiness of full-genome sequences',
y='phylogenetically closest pairs of individuals\nthat are transmission pairs, out of all such pairs\nthat can be identified in the true tree\n',
colour='part of genome used\nfor tree reconstruction',
pch='algorithm') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','pTransPairRecovered_by_gaps.pdf',sep=''))
ggsave(file=file, w=6, h=7)
ggplot(subset(sa, ACUTE=='10%' & TEAM!='MetaPIGA' & MODEL=='Model: Regional'), aes(x=TAXAN)) +
geom_jitter(aes(y=NRF, colour=GENE, pch=TEAM), position=position_jitter(w=0.8, h = 0), size=2) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_continuous(labels = scales::percent, expand=c(0,0), limits=c(0, 1)) +
scale_shape_manual(values=c('IQTree'=15, 'PhyML'=12, 'RAXML'=8)) +
labs( x='\nnumber of taxa in simulated tree',
y='RF distance\n(standardized)\n',
colour='part of genome used\nfor tree reconstruction',
pch='algorithm') +
theme_bw() + theme(legend.position='bottom') +
facet_grid(~GAPS)
file <- file.path(edir, paste(timetag,'_','RF_fulltree_polvsall_by_gaps_taxan_RegionalAcute10pc.pdf',sep=''))
ggsave(file=file, w=5, h=7)
ggplot(subset(sa, ACUTE=='40%' & TEAM!='MetaPIGA'), aes(x=TAXAN)) +
geom_jitter(aes(y=NRF, colour=GENE, pch=TEAM), position=position_jitter(w=0.8, h = 0), size=2) +
scale_colour_manual(values=c('pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_continuous(labels = scales::percent, expand=c(0,0), limits=c(0, 1)) +
scale_shape_manual(values=c('IQTree'=15, 'PhyML'=12, 'RAXML'=8)) +
labs( x='\nnumber of taxa in simulated tree',
y='RF distance\n(standardized)\n',
colour='part of genome used\nfor tree reconstruction',
pch='algorithm') +
theme_bw() + theme(legend.position='bottom') +
facet_grid(~GAPS)
file <- file.path(edir, paste(timetag,'_','RF_fulltree_polvsall_by_gaps_taxan_AllAcute40pc.pdf',sep=''))
ggsave(file=file, w=5, h=7)
ggplot(subset(sa, TEAM!='MetaPIGA'), aes(x=TAXAN)) +
geom_jitter(aes(y=NQD, colour=GENE, pch=TEAM), position=position_jitter(w=0.8, h = 0), size=2) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_continuous(labels = scales::percent, expand=c(0,0), limits=c(0, 1)) +
scale_shape_manual(values=c('IQTree'=15, 'PhyML'=12, 'RAXML'=8)) +
labs( x='\nnumber of taxa in simulated tree',
y='Quartet distance\n(standardized)\n',
colour='part of genome used\nfor tree reconstruction',
pch='algorithm') +
theme_bw() + theme(legend.position='bottom') +
facet_grid(~GAPS)
file <- file.path(edir, paste(timetag,'_','QD_polvsall_by_gaps_taxan_AllAcute40pc.pdf',sep=''))
ggsave(file=file, w=5, h=7)
ggplot(subset(sa, TEAM!='MetaPIGA' & ACUTE=='10%')) +
geom_jitter(aes(x=GAPS, y=PD, colour=GENE, pch=TEAM), position=position_jitter(w=0.8, h = 0), size=2) +
#geom_boxplot(aes(x=cut(TAXAN, breaks=seq(800,1601,200)), y=NPD, colour=GENE)) +
scale_colour_manual(values=c('gag'='red','pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
#scale_y_continuous(labels = scales::percent, expand=c(0,0), limits=c(0, 0.03)) +
scale_shape_manual(values=c('IQTree'=15, 'PhyML'=12, 'RAXML'=8)) +
labs( x='\nGappiness of full-genome sequences',
y='path distance\n',
colour='part of genome used\nfor tree reconstruction',
pch='algorithm') +
theme_bw() + theme(legend.position='bottom') +
facet_grid(TEAM~.)
file <- file.path(edir, paste(timetag,'_','PD_polvsall_by_gaps_taxan1600_Acute10pc.pdf',sep=''))
ggsave(file=file, w=5, h=10)
}
##--------------------------------------------------------------------------------------------------------
## olli 03.12.15
##--------------------------------------------------------------------------------------------------------
treecomparison.ana.160502<- function()
{
require(ggplot2)
require(data.table)
require(ape)
require(scales)
require(ggtree)
require(phangorn)
edir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation'
#timetag <- '160502'
#load(paste(edir,'/','submitted_160430.rda',sep=''))
timetag <- '160627'
load(paste(edir,'/','submitted_160627.rda',sep=''))
#
# plot trees
#
tmpdir <- '~/duke/tmp'
sc_id <- '150701_REGIONAL_TRAIN5'
gene_id <- 'GAG+POL+ENV'
team_id <- 'RAXML'
for(sc_id in c("150701_REGIONAL_TRAIN1","150701_REGIONAL_TRAIN2","150701_REGIONAL_TRAIN3","150701_REGIONAL_TRAIN4","150701_REGIONAL_TRAIN5"))
{
for(team_id in c("IQTree","RAXML"))
{
for(gene_id in submitted.info[, unique(GENE)])
{
#gene_id<- 'POL'
tmp <- subset(submitted.info, TEAM==team_id & SC==sc_id & GENE==gene_id)
if(team_id=="MetaPIGA")
tmp <- subset(tmp, grepl("best solution_use",FILE))
phr <- ttrs[[ tmp[1, SUB_IDX_T] ]]
phs <- lapply(tmp[, IDX], function(i) strs[[i]] )
# drop to common tips
z <- setdiff(phr$tip.label, phs[[1]]$tip.label)
#stopifnot( length(z)==abs(diff(c(TAXAN, TAXAN_T))) )
if(length(z))
phr <- drop.tip(phr, z, rooted=TRUE, root.edge=1)
phr <- multi2di(phr,random =FALSE)
pho <- treedist.get.tree.100bs(phr, phs, tmpdir)
ggtree(pho, layout="fan", aes(color=bootstrap), size=0.2) +
scale_colour_continuous(low="grey70", high="#1B0C42FF", guide="none") +
#scale_colour_continuous(low="#1B0C42FF", high="#D64B40FF", guide="none") +
theme_tree2(legend.position='right') + theme(axis.text=element_blank())
file <- file.path(edir, paste(timetag, '_AgreeTree100bs_', sc_id, '_', team_id, '_', gene_id, '.pdf', sep=''))
ggsave(file=file, w=6, h=6)
}
}
}
# average gappiness
#
# plot prob closest
#
sa <- copy(submitted.info)
#
set(sa, NULL, 'MODEL', sa[, factor(MODEL, levels=c('V','R'),labels=c('Model: Village','Model: Regional'))])
set(sa, sa[, which(SC=="VILL_99_APR15")],'SC',"150701_VILL_SCENARIO-C")
set(sa, NULL, 'SC', sa[, factor(SC, levels=c("150701_REGIONAL_TRAIN1", "150701_REGIONAL_TRAIN2", "150701_REGIONAL_TRAIN3", "150701_REGIONAL_TRAIN4","150701_REGIONAL_TRAIN5","150701_VILL_SCENARIO-A","150701_VILL_SCENARIO-B","150701_VILL_SCENARIO-C","150701_VILL_SCENARIO-D","150701_VILL_SCENARIO-E"),
labels=c('sc 1','sc 2','sc 3','sc 4','sc 5','sc A','sc B','sc C','sc D','sc E'))])
set(sa, NULL, 'GAPS', sa[, factor(GAPS, levels=c('none','low','high'),labels=c('none','as for Botswana\nsequences','as for Uganda\nsequences'))])
set(sa, NULL, 'BEST', sa[, factor(BEST, levels=c('Y','N'),labels=c('best tree','replicate tree'))])
set(sa, NULL, 'GENE', sa[, factor(GENE, levels=c('POL','GAG+POL+ENV'),labels=c('pol','gag+pol+env'))])
set(sa, NULL, 'TEAM', sa[, factor(TEAM, levels=sa[, sort(unique(TEAM))],labels=sa[, sort(unique(TEAM))])])
set(sa, NULL, 'EXT', sa[, factor(EXT, levels=c('~0pc','5pc'),labels=c('~ 0%/year','5%/year'))])
set(sa, NULL, 'ACUTE', sa[, factor(ACUTE, levels=c('low','high'),labels=c('10%','40%'))])
set(sa, NULL, 'ART', sa[, factor(ART, levels=c('none','fast'),labels=c('none','fast'))])
sa <- subset(sa, OTHER=='N')
#
subset(sa, MODEL=='Model: Regional')[, table(TEAM, GENE, SC)]
#
ggplot(subset(sa, !is.na(TR_REC_perc) & ACUTE=='10%'), aes(x=GAPS)) +
geom_jitter(aes(y=TR_REC_perc, colour=GENE, pch=TEAM), position=position_jitter(w=0.8, h = 0), size=2) +
scale_shape_manual(values=c('IQTree'=15, 'PhyML'=12, 'RAXML'=8, 'MetaPIGA'=17)) +
geom_point(aes(y=TR_REC_perc_T), colour="#D64B40FF", size=2) +
scale_colour_manual(values=c('pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_continuous(labels = scales::percent, limit=c(0,0.1), expand=c(0,0)) +
labs( x='\nGappiness of full-genome sequences',
y='phylogenetically closest individual is\ntransmitter or next infected\n',
colour='part of genome used\nfor tree reconstruction',
pch='algorithm') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','pTransRec_by_gaps.pdf',sep=''))
ggsave(file=file, w=5, h=7)
#
# Quartett distance
#
load(paste(edir,'/','submitted_151101_BLQDKC.rda',sep=''))
sc <- copy(sclu.info)
#
tmp <- subset(tinfo, !is.na(IDCLU))[, list(CLU_N=CLU_N[1], MXGPS_CLU= max(GPS), MDGPS_CLU=median(GPS)), by=c('SC','IDCLU')]
sc <- merge(sc, tmp, by=c('SC','IDCLU'))
set(sc, NULL, 'MODEL', sc[, factor(MODEL, levels=c('V','R'),labels=c('Model: Village','Model: Regional'))])
set(sc, sc[, which(SC=="VILL_99_APR15")],'SC',"150701_VILL_SCENARIO-C")
set(sc, NULL, 'SC', sc[, factor(SC, levels=c("150701_REGIONAL_TRAIN1", "150701_REGIONAL_TRAIN2", "150701_REGIONAL_TRAIN3", "150701_REGIONAL_TRAIN4","150701_REGIONAL_TRAIN5","150701_VILL_SCENARIO-A","150701_VILL_SCENARIO-B","150701_VILL_SCENARIO-C","150701_VILL_SCENARIO-D","150701_VILL_SCENARIO-E"),
labels=c('sc 1','sc 2','sc 3','sc 4','sc 5','sc A','sc B','sc C','sc D','sc E'))])
set(sc, NULL, 'GAPS', sc[, factor(GAPS, levels=c('none','low','high'),labels=c('none','as for Botswana\nsequences','as for Uganda\nsequences'))])
set(sc, NULL, 'BEST', sc[, factor(BEST, levels=c('Y','N'),labels=c('best tree','replicate tree'))])
set(sc, NULL, 'GENE', sc[, factor(GENE, levels=c('POL','GAG+POL+ENV'),labels=c('pol','gag+pol+env'))])
set(sc, NULL, 'TEAM', sc[, factor(TEAM, levels=sc[, sort(unique(TEAM))],labels=sc[, sort(unique(TEAM))])])
set(sc, NULL, 'EXT', sc[, factor(EXT, levels=c('~0pc','5pc'),labels=c('~ 0%/year','5%/year'))])
set(sc, NULL, 'ART', sc[, factor(ART, levels=c('none','fast'),labels=c('none','fast'))])
sc <- subset(sc, OTHER=='N')
sc <- sc[, list( SB_NQD=mean(NQDC, na.rm=TRUE) ), by=c('SC','GENE','TEAM','BEST','IDX','FILE','GAPS','MODEL','TAXAN','TAXAN_T','ROOTED','SEQCOV','ART','ACUTE','EXT','OTHER')]
sc <- subset(sc, MODEL=='Model: Regional')
ggplot(subset(sc, ACUTE=='low' & TEAM!='MetaPIGA'), aes(x=GAPS)) +
geom_jitter(aes(y=SB_NQD, colour=GENE, pch=TEAM), position=position_jitter(w=0.8, h = 0), size=2) +
scale_colour_manual(values=c('pol'="grey60", 'gag+pol+env'="#3F4788FF")) +
scale_y_continuous(labels = scales::percent, expand=c(0,0), limits=c(0, 0.4)) +
scale_shape_manual(values=c('IQTree'=15, 'PhyML'=12, 'RAXML'=8)) +
labs( x='\nGappiness of full-genome sequences',
y='Quartett distance\n(standardized)\n',
colour='part of genome used\nfor tree reconstruction',
pch='algorithm') +
theme_bw() + theme(legend.position='bottom')
file <- file.path(edir, paste(timetag,'_','QD_polvsall_by_gaps.pdf',sep=''))
ggsave(file=file, w=5, h=7)
}
##--------------------------------------------------------------------------------------------------------
## olli 03.12.15
##--------------------------------------------------------------------------------------------------------
treecomparison.ana.151203<- function()
{
require(ggplot2)
require(gamlss)
require(scales)
edir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation'
timetag <- '151203'
load(paste(edir,'/','submitted_151203.rda',sep=''))
#
#
#
sa <- copy(submitted.info)
sc <- copy(sclu.info)
#
set(sa, NULL, 'MODEL', sa[, factor(MODEL, levels=c('V','R'),labels=c('Model: Village','Model: Regional'))])
set(sa, sa[, which(SC=="VILL_99_APR15")],'SC',"150701_VILL_SCENARIO-C")
set(sa, NULL, 'SC', sa[, factor(SC, levels=c("150701_REGIONAL_TRAIN1", "150701_REGIONAL_TRAIN2", "150701_REGIONAL_TRAIN3", "150701_REGIONAL_TRAIN4","150701_REGIONAL_TRAIN5","150701_VILL_SCENARIO-A","150701_VILL_SCENARIO-B","150701_VILL_SCENARIO-C","150701_VILL_SCENARIO-D","150701_VILL_SCENARIO-E"),
labels=c('sc 1','sc 2','sc 3','sc 4','sc 5','sc A','sc B','sc C','sc D','sc E'))])
set(sa, NULL, 'GAPS', sa[, factor(GAPS, levels=c('none','low','high'),labels=c('Gaps: none','Gaps: low','Gaps: high'))])
set(sa, NULL, 'BEST', sa[, factor(BEST, levels=c('Y','N'),labels=c('best tree','replicate tree'))])
set(sa, NULL, 'GENE', sa[, factor(GENE, levels=c('POL','GAG+POL+ENV'),labels=c('pol','gag+pol+env'))])
set(sa, NULL, 'TEAM', sa[, factor(TEAM, levels=sa[, sort(unique(TEAM))],labels=sa[, sort(unique(TEAM))])])
set(sa, NULL, 'EXT', sa[, factor(EXT, levels=c('~0pc','5pc'),labels=c('~ 0%/year','5%/year'))])
set(sa, NULL, 'ACUTE', sa[, factor(ACUTE, levels=c('low','high'),labels=c('10%','40%'))])
set(sa, NULL, 'ART', sa[, factor(ART, levels=c('none','fast'),labels=c('none','fast'))])
sa <- subset(sa, OTHER=='N')
#
tmp <- subset(tinfo, !is.na(IDCLU))[, list(CLU_N=CLU_N[1], MXGPS_CLU= max(GPS), MDGPS_CLU=median(GPS)), by=c('SC','IDCLU')]
sc <- merge(sc, tmp, by=c('SC','IDCLU'))
set(sc, NULL, 'MODEL', sc[, factor(MODEL, levels=c('V','R'),labels=c('Model: Village','Model: Regional'))])
set(sc, sc[, which(SC=="VILL_99_APR15")],'SC',"150701_VILL_SCENARIO-C")
set(sc, NULL, 'SC', sc[, factor(SC, levels=c("150701_REGIONAL_TRAIN1", "150701_REGIONAL_TRAIN2", "150701_REGIONAL_TRAIN3", "150701_REGIONAL_TRAIN4","150701_REGIONAL_TRAIN5","150701_VILL_SCENARIO-A","150701_VILL_SCENARIO-B","150701_VILL_SCENARIO-C","150701_VILL_SCENARIO-D","150701_VILL_SCENARIO-E"),
labels=c('sc 1','sc 2','sc 3','sc 4','sc 5','sc A','sc B','sc C','sc D','sc E'))])
set(sc, NULL, 'GAPS', sc[, factor(GAPS, levels=c('none','low','high'),labels=c('Gaps: none','Gaps: low','Gaps: high'))])
set(sc, NULL, 'BEST', sc[, factor(BEST, levels=c('Y','N'),labels=c('best tree','replicate tree'))])
set(sc, NULL, 'GENE', sc[, factor(GENE, levels=c('POL','GAG+POL+ENV'),labels=c('pol','gag+pol+env'))])
set(sc, NULL, 'TEAM', sc[, factor(TEAM, levels=sc[, sort(unique(TEAM))],labels=sc[, sort(unique(TEAM))])])
set(sc, NULL, 'EXT', sc[, factor(EXT, levels=c('~0pc','5pc'),labels=c('~ 0%/year','5%/year'))])
set(sc, NULL, 'ART', sc[, factor(ART, levels=c('none','fast'),labels=c('none','fast'))])
sc <- subset(sc, OTHER=='N')
#
stopifnot(sc[, !any(is.na(SB_NRFC))], sc[, !any(is.na(SB_NQDC))])
scp <- sc[, list( SB_NRF=mean(SB_NRFC, na.rm=TRUE), SB_NQD=mean(SB_NQDC, na.rm=TRUE) ), by=c('SC','GENE','TEAM','BEST','IDX','FILE','GAPS','MODEL','TAXAN','TAXAN_T','ROOTED','SEQCOV','ART','ACUTE','EXT','OTHER','TIME_IDX_T','SUB_IDX_T')]
if('BILL'%in%colnames(sc))
{
tmp <- sc[, list( BILL=mean(BILL, na.rm=TRUE) ), by=c('SC','GENE','TEAM','BEST','IDX','FILE','GAPS','MODEL','TAXAN','TAXAN_T','ROOTED','SEQCOV','ART','ACUTE','EXT','OTHER','TIME_IDX_T','SUB_IDX_T')]
scp <- merge(scp, tmp, by=c('SC','GENE','TEAM','BEST','IDX','FILE','GAPS','MODEL','TAXAN','TAXAN_T','ROOTED','SEQCOV','ART','ACUTE','EXT','OTHER','TIME_IDX_T','SUB_IDX_T') )
}
if('LSD_NRFC'%in%colnames(sc))
{
tmp <- sc[, list( LSD_NRF=mean(LSD_NRFC, na.rm=TRUE) ), by=c('SC','GENE','TEAM','BEST','IDX','FILE','GAPS','MODEL','TAXAN','TAXAN_T','ROOTED','SEQCOV','ART','ACUTE','EXT','OTHER','TIME_IDX_T','SUB_IDX_T')]
scp <- merge(scp, tmp, by=c('SC','GENE','TEAM','BEST','IDX','FILE','GAPS','MODEL','TAXAN','TAXAN_T','ROOTED','SEQCOV','ART','ACUTE','EXT','OTHER','TIME_IDX_T','SUB_IDX_T') )
}
sm <- rbind( subset(sa, grepl('Village',MODEL)), scp, fill=TRUE, use.names=TRUE)
#
# PRIMARY OBJECTIVE
#
# polvsall by gaps
#
if('NRF'%in%colnames(sm))
{
ggplot( subset(sm, TEAM!='MetaPIGA'), aes(y=SB_NRF, x=SC, shape=TEAM, fill=GENE, colour=GENE, size=BEST) ) +
geom_jitter(position = position_jitter(height=.001, width=0.2)) +
scale_size_manual(values=c(3, 1)) +
scale_shape_manual(values=c(21,23,24)) +
scale_fill_brewer(palette='Paired') +
scale_colour_brewer(palette='Paired') +
facet_wrap(MODEL~GAPS, scales='free_x') +
labs(x='\nsimulated data set', y='Robinson-Fould distance\nof estimated trees with subst/site branches\n(standardized)\n', size='', shape='Method', fill='part of genome', colour='part of genome') +
theme_bw()
ggsave(w=10, h=6, file=paste(edir,'/',timetag,'_RF_SUBST_polvsall_by_gaps.pdf',sep=''))
ggplot( subset(sm, TEAM!='MetaPIGA'), aes(y=LSD_NRF, x=SC, shape=TEAM, fill=GENE, colour=GENE, size=BEST) ) +
geom_jitter(position = position_jitter(height=.001, width=0.2)) +
scale_size_manual(values=c(3, 1)) +
scale_shape_manual(values=c(21,23,24)) +
scale_fill_brewer(palette='Paired') +
scale_colour_brewer(palette='Paired') +
facet_wrap(MODEL~GAPS, scales='free_x') +
labs(x='\nsimulated data set', y='Robinson-Fould distance\nof estimated trees with dated branches\n(standardized)\n', size='', shape='Method', fill='part of genome', colour='part of genome') +
theme_bw()
ggsave(w=10, h=6, file=paste(edir,'/',timetag,'_RF_DATED_polvsall_by_gaps.pdf',sep=''))
}
if('LSD_KC_L1'%in%colnames(sa))
{
ggplot( subset(sa, TEAM!='MetaPIGA'), aes(y=LSD_KC_L1/TAXAN/TAXAN, x=SC) ) +
geom_boxplot(aes(colour=GENE), fill='transparent', size=0.5, outlier.shape=NA, alpha=0.3) +
geom_jitter(aes(shape=TEAM, fill=GENE, colour=GENE, size=BEST), position = position_jitter(height=.001, width=0.2)) +
scale_size_manual(values=c(3, 1)) +
scale_shape_manual(values=c(21,23,24)) +
scale_fill_brewer(palette='Paired') +
scale_colour_brewer(palette='Paired') +
facet_wrap(MODEL~GAPS, scales='free_x') +
labs(x='\nsimulated data set', y='Kendall-Colijn\nof estimated trees with dated branches\n(lambda=1, /TX^2)\n', size='', shape='Method', fill='part of genome', colour='part of genome') +
theme_bw()
ggsave(w=10, h=6, file=paste(edir,'/',timetag,'_KC1_DATED_polvsall_by_gaps.pdf',sep=''))
ggplot( subset(sa, TEAM!='MetaPIGA'), aes(y=LSD_KC_L0/TAXAN/TAXAN, x=SC) ) +
geom_boxplot(aes(colour=GENE), fill='transparent', size=0.5, outlier.shape=NA, alpha=0.3) +
geom_jitter(aes(shape=TEAM, fill=GENE, colour=GENE, size=BEST), position = position_jitter(height=.001, width=0.2)) +
scale_size_manual(values=c(3, 1)) +
scale_shape_manual(values=c(21,23,24)) +
scale_fill_brewer(palette='Paired') +
scale_colour_brewer(palette='Paired') +
facet_wrap(MODEL~GAPS, scales='free_x') +
labs(x='\nsimulated data set', y='Kendall-Colijn\nof estimated trees with subst/site branches\n(lambda=0, /TX^2)\n', size='', shape='Method', fill='part of genome', colour='part of genome') +
theme_bw()
ggsave(w=10, h=6, file=paste(edir,'/',timetag,'_KC0_SUBST_polvsall_by_gaps.pdf',sep=''))
}
#
# correlation between KC0 and KC1
#
ggplot( subset(sa, TEAM!='MetaPIGA'), aes(x=LSD_KC_L0, y=LSD_KC_L1) ) +
geom_point(aes(shape=TEAM, colour=SC, size=BEST)) + geom_abline(slope=1, intercept=0) +
scale_colour_brewer(palette='Paired') + scale_size_manual(values=c(3, 1)) + scale_shape_manual(values=c(21,23,24)) +
facet_grid(MODEL~GAPS) +
labs(y='Kendall-Colijn\nof estimated trees with dated branches\n(lambda=1, branch lengths)\n', x='Kendall-Colijn\nof estimated trees with dated branches\n(lambda=0, topology)\n', size='', shape='Method', colour='simulated data set') +
theme_bw()
ggsave(w=10, h=6, file=paste(edir,'/',timetag,'_KC0_KC1_correlation.pdf',sep=''))
#
# KC discrepancy due to taxa size? .. cannot rule out ..
#
ggplot( subset(sa, MODEL=="Model: Village" & TEAM!='MetaPIGA'), aes(y=LSD_KC_L0/TAXAN/TAXAN, x=SC, shape=TEAM, fill=TAXAN, colour=TAXAN, size=BEST) ) +
geom_jitter(position = position_jitter(height=.001, width=0.2)) +
scale_size_manual(values=c(3, 1)) +
scale_shape_manual(values=c(21,23,24)) +
scale_colour_gradientn(colours = rainbow(7)) +
scale_fill_gradientn(colours = rainbow(7)) +
facet_grid(GENE~GAPS) +
labs(title="Model: Village\n", x='\nsimulated data set', y='Kendall-Colijn\nof estimated trees with dated branches\n(lambda=0, topology)\n', size='', shape='Method', fill='Taxa in subm tree', colour='Taxa in subm tree') +
theme_bw()
ggsave(w=10, h=6, file=paste(edir,'/',timetag,'_KC0_polvsall_Village_by_TAXAN.pdf',sep=''))
ggplot( subset(sa, MODEL=="Model: Village" & TEAM!='MetaPIGA'), aes(y=LSD_NRF, x=SC, shape=TEAM, fill=TAXAN, colour=TAXAN, size=BEST) ) +
geom_jitter(position = position_jitter(height=.001, width=0.2)) +
scale_size_manual(values=c(3, 1)) +
scale_shape_manual(values=c(21,23,24)) +
scale_colour_gradientn(colours = rainbow(7)) +
scale_fill_gradientn(colours = rainbow(7)) +
facet_grid(GENE~GAPS) +
labs(title="Model: Village\n", x='\nsimulated data set', y='Robinson-Fould\nof estimated trees with dated branches\n', size='', shape='Method', fill='Taxa in subm tree', colour='Taxa in subm tree') +
theme_bw()
ggsave(w=10, h=6, file=paste(edir,'/',timetag,'_RF_polvsall_Village_by_TAXAN.pdf',sep=''))
#
# not an issue for RF because we see same signal in regional where taxa size is constant
#
ggplot( subset(sa, MODEL=="Model: Regional" & TEAM!='MetaPIGA'), aes(y=NRF, x=SC, shape=TEAM, fill=TAXAN, colour=TAXAN, size=BEST) ) +
geom_jitter(position = position_jitter(height=.001, width=0.2)) +
scale_size_manual(values=c(3, 1)) +
scale_shape_manual(values=c(21,23,24)) +
#scale_fill_brewer(palette='Paired') +
#scale_colour_brewer(palette='Paired') +
facet_wrap(GENE~GAPS, scales='free_x', ncol=3) +
labs(title="Model: Regional\n", x='\nsimulated data set', y='Robinson-Fould\n(standardized)\n', size='', shape='Method', fill='Taxa in subm tree', colour='Taxa in subm tree') +
theme_bw()
ggsave(w=10, h=6, file=paste(edir,'/',timetag,'_RF_polvsall_by_TAXAN_R.pdf',sep=''))
#
# plot 2D for RF
#
ggplot(subset(sa, SC=='sc 5'), aes(x=LSD_RF_MDSx/(2*TAXAN-6), y=LSD_RF_MDSy/(2*TAXAN-6))) +
geom_point(aes(colour=TEAM, shape=GENE, size=BEST)) +
geom_point(x=0, y=0, colour='black') +
scale_shape_manual(values=c(17,18)) + scale_size_manual(values=c(3, 1)) +
scale_fill_brewer(palette='Paired') + scale_colour_brewer(palette='Set1') +
labs(x='', y='') +
theme_bw()
}
##--------------------------------------------------------------------------------------------------------
## olli 19.11.15
##--------------------------------------------------------------------------------------------------------
treecomparison.ana.151019<- function()
{
require(ggplot2)
require(gamlss)
edir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation'
if(0)
{
timetag <- '151101'
file <- paste(edir,'/','submitted_',timetag,'.rda',sep='')
file <- paste(edir,'/','submitted_',timetag,'_BLQDKC.rda',sep='')
load(file)
sa <- copy(submitted.info)
sc <- copy(sclu.info)
}
if(0)
{
file <- paste(edir,'/','submitted_151016_CC.rda',sep='')
load(file)
sa <- copy(myinfo)
sc <- copy(sclu.info)
}
if(0)
{
timetag <- '151101'
file <- paste(edir,'/','submitted_',timetag,'_QD.rda',sep='')
load(file)
tmp <- copy(submitted.info)
tmp2 <- copy(sclu.info)
file <- paste(edir,'/','submitted_',timetag,'_BL.rda',sep='')
load(file)
submitted.info <- merge(submitted.info, subset(tmp, select=c('IDX','NQD')), by='IDX')
sclu.info[, BILL.x:=NULL]
setnames(sclu.info, 'BILL.y','BILL')
sclu.info <- merge(sclu.info, subset(tmp2, select=c('IDX','IDCLU','NQDC')), by=c('IDX','IDCLU'))
outfile <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/evaluation/submitted_151101_BLQD.rda'
save(strs, ttrs, tinfo, submitted.info, sclu.info, file=outfile)
}
set(sa, NULL, 'MODEL', sa[, factor(MODEL, levels=c('V','R'),labels=c('Model: Village','Model: Regional'))])
set(sa, sa[, which(SC=="VILL_99_APR15")],'SC',"150701_VILL_SCENARIO-C")
set(sa, NULL, 'SC', sa[, factor(SC, levels=c("150701_REGIONAL_TRAIN1", "150701_REGIONAL_TRAIN2", "150701_REGIONAL_TRAIN3", "150701_REGIONAL_TRAIN4","150701_REGIONAL_TRAIN5","150701_VILL_SCENARIO-A","150701_VILL_SCENARIO-B","150701_VILL_SCENARIO-C","150701_VILL_SCENARIO-D","150701_VILL_SCENARIO-E"),
labels=c('sc 1','sc 2','sc 3','sc 4','sc 5','sc A','sc B','sc C','sc D','sc E'))])
set(sa, NULL, 'GAPS', sa[, factor(GAPS, levels=c('none','low','high'),labels=c('Gaps: none','Gaps: low','Gaps: high'))])
set(sa, NULL, 'BEST', sa[, factor(BEST, levels=c('Y','N'),labels=c('best tree','replicate tree'))])
set(sa, NULL, 'GENE', sa[, factor(GENE, levels=c('POL','GAG+POL+ENV'),labels=c('pol','gag+pol+env'))])
set(sa, NULL, 'TEAM', sa[, factor(TEAM, levels=sa[, sort(unique(TEAM))],labels=sa[, sort(unique(TEAM))])])
set(sa, NULL, 'EXT', sa[, factor(EXT, levels=c('~0pc','5pc'),labels=c('~ 0%/year','5%/year'))])
set(sa, NULL, 'ACUTE', sa[, factor(ACUTE, levels=c('low','high'),labels=c('10%','40%'))])
set(sa, NULL, 'ART', sa[, factor(ART, levels=c('none','fast'),labels=c('none','fast'))])
sa <- subset(sa, OTHER=='N')
tmp <- subset(tinfo, !is.na(IDCLU))[, list(CLU_N=CLU_N[1], MXGPS_CLU= max(GPS), MDGPS_CLU=median(GPS)), by=c('SC','IDCLU')]
sc <- merge(sc, tmp, by=c('SC','IDCLU'))
set(sc, NULL, 'MODEL', sc[, factor(MODEL, levels=c('V','R'),labels=c('Model: Village','Model: Regional'))])
set(sc, sc[, which(SC=="VILL_99_APR15")],'SC',"150701_VILL_SCENARIO-C")
set(sc, NULL, 'SC', sc[, factor(SC, levels=c("150701_REGIONAL_TRAIN1", "150701_REGIONAL_TRAIN2", "150701_REGIONAL_TRAIN3", "150701_REGIONAL_TRAIN4","150701_REGIONAL_TRAIN5","150701_VILL_SCENARIO-A","150701_VILL_SCENARIO-B","150701_VILL_SCENARIO-C","150701_VILL_SCENARIO-D","150701_VILL_SCENARIO-E"),
labels=c('sc 1','sc 2','sc 3','sc 4','sc 5','sc A','sc B','sc C','sc D','sc E'))])
set(sc, NULL, 'GAPS', sc[, factor(GAPS, levels=c('none','low','high'),labels=c('Gaps: none','Gaps: low','Gaps: high'))])
set(sc, NULL, 'BEST', sc[, factor(BEST, levels=c('Y','N'),labels=c('best tree','replicate tree'))])
set(sc, NULL, 'GENE', sc[, factor(GENE, levels=c('POL','GAG+POL+ENV'),labels=c('pol','gag+pol+env'))])
set(sc, NULL, 'TEAM', sc[, factor(TEAM, levels=sc[, sort(unique(TEAM))],labels=sc[, sort(unique(TEAM))])])
set(sc, NULL, 'EXT', sc[, factor(EXT, levels=c('~0pc','5pc'),labels=c('~ 0%/year','5%/year'))])
set(sc, NULL, 'ART', sc[, factor(ART, levels=c('none','fast'),labels=c('none','fast'))])
sc <- subset(sc, OTHER=='N')
stopifnot(sc[, !any(is.na(NRFC))], sc[, !any(is.na(NQDC))])
scp <- sc[, list( NRF=mean(NRFC, na.rm=TRUE), NQD=mean(NQDC, na.rm=TRUE) ), by=c('SC','GENE','TEAM','BEST','IDX','FILE','GAPS','MODEL','TAXAN','TAXAN_T','ROOTED','SEQCOV','ART','ACUTE','EXT','OTHER','TIME_IDX_T','SUB_IDX_T')]
if('BILL'%in%colnames(sc))
{
tmp <- sc[, list( BILL=mean(BILL, na.rm=TRUE) ), by=c('SC','GENE','TEAM','BEST','IDX','FILE','GAPS','MODEL','TAXAN','TAXAN_T','ROOTED','SEQCOV','ART','ACUTE','EXT','OTHER','TIME_IDX_T','SUB_IDX_T')]
scp <- merge(scp, tmp, by=c('SC','GENE','TEAM','BEST','IDX','FILE','GAPS','MODEL','TAXAN','TAXAN_T','ROOTED','SEQCOV','ART','ACUTE','EXT','OTHER','TIME_IDX_T','SUB_IDX_T') )
}
sm <- rbind( subset(sa, grepl('Village',MODEL)), scp, fill=TRUE, use.names=TRUE)
#
# polvsall by gaps
# -->
# all leads to improvements throughout
# with gaps, the topology without branch lengths is increasingly difficult to estimate
# regional overall more difficult!
if('NRF'%in%colnames(sm))
{
ggplot( subset(sm, TEAM!='MetaPIGA'), aes(y=NRF, x=SC, shape=TEAM, fill=GENE, colour=GENE, size=BEST) ) +
geom_jitter(position = position_jitter(height=.001, width=0.2)) +
scale_size_manual(values=c(3, 1)) +
scale_shape_manual(values=c(21,23,24)) +
scale_fill_brewer(palette='Paired') +
scale_colour_brewer(palette='Paired') +
facet_wrap(MODEL~GAPS, scales='free_x') +
labs(x='\nsimulated data set', y='Robinson-Fould\n(standardized)\n', size='', shape='Method', fill='part of genome', colour='part of genome') +
theme_bw()
ggsave(w=10, h=6, file=paste(edir,'/',timetag,'_RF_polvsall_by_gaps.pdf',sep=''))
}
if('BILL'%in%colnames(sm))
{
ggplot( subset(sm, TEAM!='MetaPIGA'), aes(y=BILL, x=SC, shape=TEAM, fill=GENE, colour=GENE, size=BEST) ) +
geom_jitter(position = position_jitter(height=.001, width=0.2)) +
scale_size_manual(values=c(3, 1)) +
scale_shape_manual(values=c(21,23,24)) +
scale_fill_brewer(palette='Paired') +
scale_colour_brewer(palette='Paired') +
facet_wrap(MODEL~GAPS, scales='free') +
labs(x='\nsimulated data set', y='Geodesic\n(raw)\n', size='', shape='Method', fill='part of genome', colour='part of genome') +
theme_bw()
ggsave(w=10, h=6, file=paste(edir,'/',timetag,'_BL_polvsall_by_gaps.pdf',sep=''))
}
if('NQD'%in%colnames(sm))
{
ggplot( subset(sm, TEAM!='MetaPIGA'), aes(y=NQD, x=SC, shape=TEAM, fill=GENE, colour=GENE, size=BEST) ) +
geom_jitter(position = position_jitter(height=.001, width=0.2)) +
scale_size_manual(values=c(3, 1)) +
scale_shape_manual(values=c(21,23,24)) +
scale_fill_brewer(palette='Paired') +
scale_colour_brewer(palette='Paired') +
facet_wrap(MODEL~GAPS, scales='free_x') +
labs(x='\nsimulated data set', y='Quartett\n(standardized)\n', size='', shape='Method', fill='part of genome', colour='part of genome') +
theme_bw()
ggsave(w=10, h=6, file=paste(edir,'/',timetag,'_QD_polvsall_by_gaps.pdf',sep=''))
}
if('KC1'%in%colnames(sa))
{
ggplot( subset(sa, TEAM!='MetaPIGA'), aes(y=KC1/TAXAN/TAXAN, x=SC) ) +
geom_boxplot(aes(colour=GENE), fill='transparent', size=0.5, outlier.shape=NA, alpha=0.3) +
geom_jitter(aes(shape=TEAM, fill=GENE, colour=GENE, size=BEST), position = position_jitter(height=.001, width=0.2)) +
scale_size_manual(values=c(3, 1)) +
scale_shape_manual(values=c(21,23,24)) +
scale_fill_brewer(palette='Paired') +
scale_colour_brewer(palette='Paired') +
facet_wrap(MODEL~GAPS, scales='free_x') +
labs(x='\nsimulated data set', y='Kendall-Colijn\n(lambda=0, rtt if unrooted, /TX^2)\n', size='', shape='Method', fill='part of genome', colour='part of genome') +
theme_bw()
ggsave(w=10, h=6, file=paste(edir,'/',timetag,'_KC1_polvsall_by_gaps.pdf',sep=''))
ggplot( subset(sa, TEAM!='MetaPIGA'), aes(y=KC2/TAXAN/TAXAN, x=SC) ) +
geom_boxplot(aes(colour=GENE), fill='transparent', size=0.5, outlier.shape=NA, alpha=0.3) +
geom_jitter(aes(shape=TEAM, fill=GENE, colour=GENE, size=BEST), position = position_jitter(height=.001, width=0.2)) +
scale_size_manual(values=c(3, 1)) +
scale_shape_manual(values=c(21,23,24)) +
scale_fill_brewer(palette='Paired') +
scale_colour_brewer(palette='Paired') +
facet_wrap(MODEL~GAPS, scales='free_x') +
labs(x='\nsimulated data set', y='Kendall-Colijn\n(lambda=0, rtt all, /TX^2)\n', size='', shape='Method', fill='part of genome', colour='part of genome') +
theme_bw()
ggsave(w=10, h=6, file=paste(edir,'/',timetag,'_KC2_polvsall_by_gaps.pdf',sep=''))
}
if('NPD'%in%colnames(sa))
{
ggplot( subset(sa, TEAM!='MetaPIGA'), aes(y=NPD, x=SC, shape=TEAM, fill=GENE, colour=GENE, size=BEST) ) +
geom_jitter(position = position_jitter(height=.001, width=0.2)) +
scale_size_manual(values=c(3, 1)) +
scale_shape_manual(values=c(21,23,24)) +
scale_fill_brewer(palette='Paired') +
scale_colour_brewer(palette='Paired') +
facet_wrap(MODEL~GAPS, scales='free') +
labs(x='\nsimulated data set', y='Path difference\n(standardized)\n', size='', shape='Method', fill='part of genome', colour='part of genome') +
theme_bw()
ggsave(w=10, h=6, file=paste(edir,'/151023_PD_polvsall_by_gaps.pdf',sep=''))
}
# RF may be confounded by size of data set when evaluating the extent that regional is more difficult
# -->
# hard to extrapolate how standardized RF grows with size of data set,
# but regression extrapolation suggests there is an effect
if('NRF'%in%colnames(sm))
{
z <- subset(sa, TEAM!='MetaPIGA' & !grepl('Reg',MODEL))
mo <- gamlss(NRF~TAXAN+GENE+GAPS, sigma.formula=~TAXAN+GENE+GAPS, family=BE(mu.link='cauchit'), data=z)
tmp <- subset(sa, TEAM!='MetaPIGA')
tmp[, NRFP:=predict(mo, data=z, newdata=subset(tmp, select=c(TAXAN,GENE,GAPS)), what='mu',type='response')]
ggplot( tmp, aes(x=TAXAN) ) +
geom_jitter(aes(y=NRF, shape=TEAM, colour=EXT, fill=EXT, size=BEST), position = position_jitter(height=.001, width=20), alpha=0.7) +
geom_line(aes(y=NRFP), colour='black', size=0.5) +
scale_size_manual(values=c(3, 1)) +
scale_shape_manual(values=c(21,23,24)) +
scale_fill_brewer(palette='Set1') + scale_colour_brewer(palette='Set1') +
scale_y_continuous(breaks=seq(0,1,0.2), minor_breaks=seq(0,1,0.1)) +
facet_grid(GAPS~GENE) +
labs(x='\nsize of simulated data set', y='Robinson-Fould\n(standardized)\n', size='', shape='Method', fill='trms/outside', colour='trms/outside') +
theme_bw()
ggsave(w=10, h=8, file=paste(edir,'/',timetag,'_RF_trmsoutside.pdf',sep=''))
#
#
#
ggplot( subset(sa, MODEL=="Model: Village" & TEAM!='MetaPIGA'), aes(y=NRF, x=SC, shape=TEAM, fill=TAXAN, colour=TAXAN, size=BEST) ) +
geom_jitter(position = position_jitter(height=.001, width=0.2)) +
scale_size_manual(values=c(3, 1)) +
scale_shape_manual(values=c(21,23,24)) +
#scale_fill_brewer(palette='Paired') +
#scale_colour_brewer(palette='Paired') +
facet_wrap(GENE~GAPS, scales='free_x', ncol=3) +
labs(title="Model: Village\n", x='\nsimulated data set', y='Robinson-Fould\n(standardized)\n', size='', shape='Method', fill='Taxa in subm tree', colour='Taxa in subm tree') +
theme_bw()
ggsave(w=10, h=6, file=paste(edir,'/',timetag,'_RF_polvsall_by_TAXAN_V.pdf',sep=''))
ggplot( subset(sa, MODEL=="Model: Regional" & TEAM!='MetaPIGA'), aes(y=NRF, x=SC, shape=TEAM, fill=TAXAN, colour=TAXAN, size=BEST) ) +
geom_jitter(position = position_jitter(height=.001, width=0.2)) +
scale_size_manual(values=c(3, 1)) +
scale_shape_manual(values=c(21,23,24)) +
#scale_fill_brewer(palette='Paired') +
#scale_colour_brewer(palette='Paired') +
facet_wrap(GENE~GAPS, scales='free_x', ncol=3) +
labs(title="Model: Regional\n", x='\nsimulated data set', y='Robinson-Fould\n(standardized)\n', size='', shape='Method', fill='Taxa in subm tree', colour='Taxa in subm tree') +
theme_bw()
ggsave(w=10, h=6, file=paste(edir,'/',timetag,'_RF_polvsall_by_TAXAN_R.pdf',sep=''))
}
# teams
# -->
# MetaPIGA fairly bad in terms of RF
# PhyML IQTree RAxML similar in terms of RF,
# but PhyML behaved poorly when many gaps
if('NRF'%in%colnames(sm))
{
ggplot( sm, aes(y=NRF, x=SC, shape=TEAM, colour=TEAM, fill=TEAM, size=BEST) ) +
geom_jitter(position = position_jitter(height=.001, width=0.2), alpha=0.7) +
scale_size_manual(values=c(4, 1)) +
scale_shape_manual(values=c(21,22,23,24)) +
scale_fill_brewer(palette='Set1') + scale_colour_brewer(palette='Set1') +
scale_y_continuous(breaks=seq(0,1,0.2), minor_breaks=seq(0,1,0.1)) +
facet_grid(MODEL~GENE) +
labs(x='\nsimulated data set', y='Robinson-Fould\n(standardized)\n', size='', shape='Method', fill='Method', colour='Method') +
theme_bw()
ggsave(w=10, h=7, file=paste(edir,'/',timetag,'_RF_team_by_scenarioandgene.pdf',sep=''))
}
if('NQD'%in%colnames(sm))
{
ggplot( sm, aes(y=NQD, x=SC, shape=TEAM, colour=TEAM, fill=TEAM, size=BEST) ) +
geom_jitter(position = position_jitter(height = .001, width=0.2), alpha=0.7) +
scale_size_manual(values=c(4, 1)) +
scale_shape_manual(values=c(21,22,23,24)) +
scale_fill_brewer(palette='Set1') + scale_colour_brewer(palette='Set1') +
scale_y_continuous(breaks=seq(0,1,0.2), minor_breaks=seq(0,1,0.1)) +
facet_grid(MODEL~GENE) +
labs(x='\nsimulated data set', y='Quartett\n(standardized)\n', size='', shape='Method', fill='Method', colour='Method') +
theme_bw()
ggsave(w=10, h=7, file=paste(edir,'/',timetag,'_QD_team_by_scenarioandgene.pdf',sep=''))
}
if('BILL'%in%colnames(sm))
{
ggplot( sm, aes(y=BILL, x=SC, shape=TEAM, colour=TEAM, fill=TEAM, size=BEST) ) +
geom_jitter(position = position_jitter(height = .001, width=0.2), alpha=0.7) +
scale_size_manual(values=c(4, 1)) +
scale_shape_manual(values=c(21,22,23,24)) +
scale_fill_brewer(palette='Set1') + scale_colour_brewer(palette='Set1') +
#scale_y_continuous(breaks=seq(0,1,0.2), minor_breaks=seq(0,1,0.1)) +
facet_grid(MODEL~GENE, scales='free') +
labs(x='\nsimulated data set', y='Geodesic\n(raw)\n', size='', shape='Method', fill='Method', colour='Method') +
theme_bw()
ggsave(w=10, h=7, file=paste(edir,'/',timetag,'_BL_team_by_scenarioandgene.pdf',sep=''))
}
if('KC1'%in%colnames(sa))
{
ggplot( sa, aes(y=KC1/TAXAN/TAXAN, x=SC, shape=TEAM, colour=TEAM, fill=TEAM, size=BEST) ) +
geom_jitter(position = position_jitter(height=.001, width=0.2), alpha=0.7) +
scale_size_manual(values=c(4, 1)) +
scale_shape_manual(values=c(21,22,23,24)) +
scale_fill_brewer(palette='Set1') + scale_colour_brewer(palette='Set1') +
#scale_y_continuous(breaks=seq(0,1,0.2), minor_breaks=seq(0,1,0.1)) +
facet_grid(MODEL~GENE) +
labs(x='\nsimulated data set', y='Kendall-Colijn\n(lambda=0, rtt if unrooted, /TX^2)\n', size='', shape='Method', fill='Method', colour='Method') +
theme_bw()
ggsave(w=10, h=7, file=paste(edir,'/',timetag,'_KC1_team_by_scenarioandgene.pdf',sep=''))
ggplot( sa, aes(y=KC2/TAXAN/TAXAN, x=SC, shape=TEAM, colour=TEAM, fill=TEAM, size=BEST) ) +
geom_jitter(position = position_jitter(height=.001, width=0.2), alpha=0.7) +
scale_size_manual(values=c(4, 1)) +
scale_shape_manual(values=c(21,22,23,24)) +
scale_fill_brewer(palette='Set1') + scale_colour_brewer(palette='Set1') +
#scale_y_continuous(breaks=seq(0,1,0.2), minor_breaks=seq(0,1,0.1)) +
facet_grid(MODEL~GENE) +
labs(x='\nsimulated data set', y='Kendall-Colijn\n(lambda=0, rtt all, /TX^2)\n', size='', shape='Method', fill='Method', colour='Method') +
theme_bw()
ggsave(w=10, h=7, file=paste(edir,'/',timetag,'_KC2_team_by_scenarioandgene.pdf',sep=''))
}
# taxa excluded: plot cluster RF as a function of cluster size
# -->
# excluding taxa did not lead to noticeably lower RFs
if('NRFC'%in%colnames(sc))
{
tmp <- sc[, list(NRF=median(NRFC, na.rm=TRUE)), by=c('SC','GENE','TEAM','BEST','IDX','FILE','GAPS','MODEL')]
tmp <- subset(tmp, TEAM!='MetaPIGA')
ggplot( tmp, aes(y=NRF, x=SC, shape=TEAM, fill=GENE, colour=GENE, size=BEST) ) +
geom_jitter(position = position_jitter(height = .001, width=0.2)) +
scale_size_manual(values=c(3, 1)) +
scale_shape_manual(values=c(21,22,23,24)) +
scale_fill_brewer(palette='Paired') +
scale_colour_brewer(palette='Paired') +
facet_wrap(MODEL~GAPS, scales='free_x') +
labs(x='\nsimulated data set', y='median Robinson-Fould\n(standardized)\n', size='', shape='Method', fill='part of genome', colour='part of genome') +
theme_bw()
ggsave(w=10, h=6, file=paste(edir,'/',timetag,'_RFCLU_polvsall_by_gaps.pdf',sep=''))
}
ggplot( subset(sc, GENE=='gag+pol+env'), aes(y=NRFC, x=TAXA_NC, size=BEST, shape=TEAM, fill=TAXA_NC<CLU_N, colour=TAXA_NC<CLU_N)) +
geom_jitter(position = position_jitter(height=.001, width=0.1), alpha=0.7) +
scale_size_manual(values=c(3, 1)) +
scale_shape_manual(values=c(21,22,24), guide=FALSE) +
scale_fill_brewer(palette='Set1') +
scale_colour_brewer(palette='Set1') +
scale_y_continuous(breaks=seq(0,1,0.2), minor_breaks=seq(0,1,0.1)) +
#coord_trans(x='log10') +
scale_x_log10(breaks=c(1,2,3,4,5,6,8,10,20,50,100,200,300), minor_breaks=NULL) +
labs(x='\nsize of sampled transmission cluster', y='Robinson-Fould\n(standardized per transmission cluster)\n', size='', shape='Method', fill='taxa excluded\nprior to reconstruction', colour='taxa excluded\nprior to reconstruction') +
facet_grid(SC~TEAM) +
theme_bw()
ggsave(w=16, h=8, file=paste(edir,'/',timetag,'_RFCLU_iftaxaexcludedbeforetreereconstruction.pdf',sep=''))
# effect of acute in terms of RF? --> Yes
if('NRF'%in%colnames(sm))
{
ggplot( subset(sm, TEAM!='MetaPIGA' & TEAM!='PhyML' & grepl('Reg',MODEL) & !grepl('none',GAPS)), aes(y=NRF, x=ACUTE, shape=TEAM, fill=ACUTE, colour=ACUTE) ) +
geom_jitter(aes(size=BEST), position = position_jitter(height=.001, width=0.2), alpha=0.8) +
geom_boxplot(outlier.shape=NA, colour='black', alpha=0.3) +
scale_size_manual(values=c(3, 1)) +
scale_shape_manual(values=c(21,24), guide=FALSE) +
scale_fill_brewer(palette='Set1', guide=FALSE) + scale_colour_brewer(palette='Set1', guide=FALSE) +
facet_grid(GAPS~TEAM+GENE, scales='free_x') +
labs(x='\ntransmissions from those in acute infection', y='Robinson-Fould\n(standardized)\n', size='') +
theme_bw()
ggsave(w=10, h=8, file=paste(edir,'/',timetag,'_RF_impactAcute.pdf',sep=''))
}
if('BILL'%in%colnames(sm))
{
ggplot( subset(sm, TEAM!='MetaPIGA' & TEAM!='PhyML' & grepl('Reg',MODEL) & !grepl('none',GAPS)), aes(y=BILL, x=ACUTE, shape=TEAM, fill=ACUTE, colour=ACUTE) ) +
geom_jitter(aes(size=BEST), position = position_jitter(height=.001, width=0.2), alpha=0.8) +
geom_boxplot(outlier.shape=NA, colour='black', alpha=0.3) +
scale_size_manual(values=c(3, 1)) +
scale_shape_manual(values=c(21,24), guide=FALSE) +
scale_fill_brewer(palette='Set1', guide=FALSE) + scale_colour_brewer(palette='Set1', guide=FALSE) +
facet_grid(GAPS~TEAM+GENE, scales='free_x') +
labs(x='\ntransmissions from those in acute infection', y='Geodesic\n(raw)\n', size='') +
theme_bw()
ggsave(w=10, h=8, file=paste(edir,'/',timetag,'_BL_impactAcute.pdf',sep=''))
}
if('NQD'%in%colnames(sm))
{
ggplot( subset(sm, TEAM!='MetaPIGA' & TEAM!='PhyML' & grepl('Reg',MODEL) & !grepl('none',GAPS)), aes(y=NQD, x=ACUTE, shape=TEAM, fill=ACUTE, colour=ACUTE) ) +
geom_jitter(aes(size=BEST), position = position_jitter(height=.001, width=0.2), alpha=0.8) +
geom_boxplot(outlier.shape=NA, colour='black', alpha=0.3) +
scale_size_manual(values=c(3, 1)) +
scale_shape_manual(values=c(21,24), guide=FALSE) +
scale_fill_brewer(palette='Set1', guide=FALSE) + scale_colour_brewer(palette='Set1', guide=FALSE) +
facet_grid(GAPS~TEAM+GENE, scales='free_x') +
labs(x='\ntransmissions from those in acute infection', y='Quartett\n(standardized)\n', size='') +
theme_bw()
ggsave(w=10, h=8, file=paste(edir,'/',timetag,'_QD_impactAcute.pdf',sep=''))
}
if('KC1'%in%colnames(sa))
{
ggplot( subset(sa, TEAM!='MetaPIGA' & TEAM!='PhyML' & grepl('Reg',MODEL) & !grepl('none',GAPS)), aes(y=KC1/TAXAN/TAXAN, x=ACUTE, shape=TEAM, fill=ACUTE, colour=ACUTE) ) +
geom_jitter(aes(size=BEST), position = position_jitter(height=.001, width=0.2), alpha=0.8) +
geom_boxplot(outlier.shape=NA, colour='black', alpha=0.3) +
scale_size_manual(values=c(3, 1)) +
scale_shape_manual(values=c(21,24), guide=FALSE) +
scale_fill_brewer(palette='Set1', guide=FALSE) + scale_colour_brewer(palette='Set1', guide=FALSE) +
facet_grid(GAPS~TEAM+GENE, scales='free_x') +
labs(x='\ntransmissions from those in acute infection', y='Kendall-Colijn\n(lambda=0, rtt if unrooted, /TX^2)\n', size='') +
theme_bw()
ggsave(w=10, h=8, file=paste(edir,'/',timetag,'_KC1_impactAcute.pdf',sep=''))
ggplot( subset(sa, TEAM!='MetaPIGA' & TEAM!='PhyML' & grepl('Reg',MODEL) & !grepl('none',GAPS)), aes(y=KC2/TAXAN/TAXAN, x=ACUTE, shape=TEAM, fill=ACUTE, colour=ACUTE) ) +
geom_jitter(aes(size=BEST), position = position_jitter(height=.001, width=0.2), alpha=0.8) +
geom_boxplot(outlier.shape=NA, colour='black', alpha=0.3) +
scale_size_manual(values=c(3, 1)) +
scale_shape_manual(values=c(21,24), guide=FALSE) +
scale_fill_brewer(palette='Set1', guide=FALSE) + scale_colour_brewer(palette='Set1', guide=FALSE) +
facet_grid(GAPS~TEAM+GENE, scales='free_x') +
labs(x='\ntransmissions from those in acute infection', y='Kendall-Colijn\n(lambda=0, rtt all, /TX^2)\n', size='') +
theme_bw()
ggsave(w=10, h=8, file=paste(edir,'/',timetag,'_KC2_impactAcute.pdf',sep=''))
}
# effect of ART roll out in terms of RF? --> No
if('NRF'%in%colnames(sa))
{
ggplot( subset(sa, TEAM!='MetaPIGA' & grepl('Vill',MODEL) & !grepl('none',GAPS)), aes(y=NRF, x=ART, shape=TEAM, fill=ART, colour=ART) ) +
geom_jitter(aes(size=BEST), position = position_jitter(height=.001, width=0.2), alpha=0.8) +
geom_boxplot(outlier.shape=NA, colour='black', alpha=0.3) +
scale_size_manual(values=c(3, 1)) +
scale_shape_manual(values=c(21,23,24), guide=FALSE) +
scale_fill_brewer(palette='Set2', guide=FALSE) + scale_colour_brewer(palette='Set2', guide=FALSE) +
facet_grid(GAPS~TEAM+GENE, scales='free_x') +
labs(x='\nART roll-out', y='Robinson-Fould\n(standardized)\n', size='') +
theme_bw()
ggsave(w=10, h=8, file=paste(edir,'/',timetag,'_RF_impactART.pdf',sep=''))
}
if('NQD'%in%colnames(sa))
{
ggplot( subset(sa, TEAM!='MetaPIGA' & grepl('Vill',MODEL) & !grepl('none',GAPS)), aes(y=NQD, x=ART, shape=TEAM, fill=ART, colour=ART) ) +
geom_jitter(aes(size=BEST), position = position_jitter(height=.001, width=0.2), alpha=0.8) +
geom_boxplot(outlier.shape=NA, colour='black', alpha=0.3) +
scale_size_manual(values=c(3, 1)) +
scale_shape_manual(values=c(21,23,24), guide=FALSE) +
scale_fill_brewer(palette='Set2', guide=FALSE) + scale_colour_brewer(palette='Set2', guide=FALSE) +
facet_grid(GAPS~TEAM+GENE, scales='free_x') +
labs(x='\nART roll-out', y='Quartett\n(standardized)\n', size='') +
theme_bw()
ggsave(w=10, h=8, file=paste(edir,'/',timetag,'_QD_impactART.pdf',sep=''))
}
if('BILL'%in%colnames(sm))
{
ggplot( subset(sm, TEAM!='MetaPIGA' & grepl('Vill',MODEL) & !grepl('none',GAPS)), aes(y=BILL, x=ART, shape=TEAM, fill=ART, colour=ART) ) +
geom_jitter(aes(size=BEST), position = position_jitter(height=.001, width=0.2), alpha=0.8) +
geom_boxplot(outlier.shape=NA, colour='black', alpha=0.3) +
scale_size_manual(values=c(3, 1)) +
scale_shape_manual(values=c(21,23,24), guide=FALSE) +
scale_fill_brewer(palette='Set2', guide=FALSE) + scale_colour_brewer(palette='Set2', guide=FALSE) +
facet_grid(GAPS~TEAM+GENE, scales='free_x') +
labs(x='\nART roll-out', y='Geodesic\n(raw)\n', size='') +
theme_bw()
ggsave(w=10, h=8, file=paste(edir,'/',timetag,'_BL_impactART.pdf',sep=''))
}
if('KC1'%in%colnames(sa))
{
ggplot( subset(sa, TEAM!='MetaPIGA' & grepl('Vill',MODEL) & !grepl('none',GAPS)), aes(y=KC1/TAXAN/TAXAN, x=ART, shape=TEAM, fill=ART, colour=ART) ) +
geom_jitter(aes(size=BEST), position = position_jitter(height=.001, width=0.2), alpha=0.8) +
geom_boxplot(outlier.shape=NA, colour='black', alpha=0.3) +
scale_size_manual(values=c(3, 1)) +
scale_shape_manual(values=c(21,23,24), guide=FALSE) +
scale_fill_brewer(palette='Set2', guide=FALSE) + scale_colour_brewer(palette='Set2', guide=FALSE) +
facet_grid(GAPS~TEAM+GENE, scales='free_x') +
labs(x='\nART roll-out', y='Kendall-Colijn\n(lambda=0, rtt if unrooted, /TX^2)\n', size='') +
theme_bw()
ggsave(w=10, h=8, file=paste(edir,'/',timetag,'_KC1_impactART.pdf',sep=''))
ggplot( subset(sa, TEAM!='MetaPIGA' & grepl('Vill',MODEL) & !grepl('none',GAPS)), aes(y=KC2/TAXAN/TAXAN, x=ART, shape=TEAM, fill=ART, colour=ART) ) +
geom_jitter(aes(size=BEST), position = position_jitter(height=.001, width=0.2), alpha=0.8) +
geom_boxplot(outlier.shape=NA, colour='black', alpha=0.3) +
scale_size_manual(values=c(3, 1)) +
scale_shape_manual(values=c(21,23,24), guide=FALSE) +
scale_fill_brewer(palette='Set2', guide=FALSE) + scale_colour_brewer(palette='Set2', guide=FALSE) +
facet_grid(GAPS~TEAM+GENE, scales='free_x') +
labs(x='\nART roll-out', y='Kendall-Colijn\n(lambda=0, rtt all, /TX^2)\n', size='') +
theme_bw()
ggsave(w=10, h=8, file=paste(edir,'/',timetag,'_KC2_impactART.pdf',sep=''))
}
}
treecomparison.submissions.150930<- function()
{
require(data.table)
require(ape)
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/IQTree/IQTree201507'
infiles <- list.files(indir, pattern='treefile$', recursive=1, full.names=1)
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/PhyML'
infiles <- c(infiles, list.files(indir, pattern='*tree*', recursive=1, full.names=1))
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/RAxML'
infiles <- c(infiles, list.files(indir, pattern='*RAxML_bestTree*', recursive=1, full.names=1))
infiles <- data.table(FILE=infiles)
submitted.trees <- lapply(infiles[, FILE], function(x)
{
cat(x)
read.tree(file=x)
})
names(submitted.trees) <- infiles[, FILE]
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/MetaPIGA'
tmp <- list.files(indir, pattern='*result*', recursive=1, full.names=1)
tmp <- data.table(FILE=tmp)
tmp.trees <- lapply(tmp[, FILE], function(x)
{
cat(x)
read.nexus(file=x)
})
sapply(tmp.trees, length)
MetaPIGA.trees <- c(lapply(tmp.trees, '[[', 1), lapply(tmp.trees, '[[', 2), lapply(tmp.trees, '[[', 3), lapply(tmp.trees, '[[', 4))
names(MetaPIGA.trees) <- c(sapply(tmp.trees, function(x) names(x)[1]), sapply(tmp.trees, function(x) names(x)[2]), sapply(tmp.trees, function(x) names(x)[3]), sapply(tmp.trees, function(x) names(x)[4]))
submitted.trees <- c(submitted.trees, MetaPIGA.trees)
submitted.info <- data.table(FILE=names(submitted.trees))
submitted.info[, TEAM:=NA_character_]
set(submitted.info, submitted.info[, which(grepl('RAXML',FILE))], 'TEAM', 'RAXML')
set(submitted.info, submitted.info[, which(grepl('IQTree',FILE))], 'TEAM', 'IQTree')
set(submitted.info, submitted.info[, which(grepl('MetaPIGA',FILE))], 'TEAM', 'MetaPIGA')
set(submitted.info, submitted.info[, which(grepl('PhyML',FILE))], 'TEAM', 'PhyML')
submitted.info[, SC:=NA_character_]
tmp <- submitted.info[, which(grepl('150701_Regional_TRAIN[0-9]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, regmatches(FILE, regexpr('150701_Regional_TRAIN[0-9]',FILE))])
tmp <- submitted.info[, which(grepl('150701_Vill_SCENARIO-[A-Z]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, regmatches(FILE, regexpr('150701_Vill_SCENARIO-[A-Z]',FILE))])
tmp <- submitted.info[, which(is.na(SC) & grepl('TRAIN[0-9]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, paste('150701_Regional_',regmatches(FILE, regexpr('TRAIN[0-9]',FILE)),sep='')])
tmp <- submitted.info[, which(is.na(SC) & grepl('scenario[A-Z]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, paste('150701_Vill_',regmatches(FILE, regexpr('scenario[A-Z]',FILE)),sep='')])
tmp <- submitted.info[, which(is.na(SC) & grepl('150701_regional_train[0-9]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, regmatches(FILE, regexpr('150701_regional_train[0-9]',FILE))])
tmp <- submitted.info[, which(is.na(SC) & grepl('150701_vill_scenario-[A-Z]', FILE))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, regmatches(FILE, regexpr('150701_vill_scenario-[A-Z]',FILE))])
tmp <- submitted.info[, which(is.na(SC) & grepl('Vill_99_Apr15', FILE))]
set(submitted.info, tmp, 'SC', 'Vill_99_Apr15')
set(submitted.info, NULL, 'SC', submitted.info[, toupper(SC)])
tmp <- submitted.info[, which(grepl('150701_VILL_SCENARIO[A-Z]', SC))]
set(submitted.info, tmp, 'SC', submitted.info[tmp, gsub('150701_VILL_SCENARIO','150701_VILL_SCENARIO-',SC)])
outfile <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/submitted_150911.rda'
save(submitted.trees, submitted.info, file=outfile)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.