prog.hello<- function()
{
print('hello')
file <- '/Users/Oliver/Library/R/2.15/library/PANGEA.HIV.sim/libs/x86_64/PANGEA.HIV.sim.so'
dyn.load(file)
is.loaded('seqgen')
argv <- "-n1 -k1 -on -z42 -mGTR -a1 -g4 -i0 -s1 -f0.25,0.25,0.25,0.25 -r1,1,1,1,1,1 </Users/Oliver/git/HPTN071sim/tmp140914-3/SeqGen/140716_RUN001-3_911_POL_CP3.phy> /Users/Oliver/git/HPTN071sim/tmp140914-3/SeqGen/140716_RUN001-3_911_POL_CP3.seqgen"
z <- strsplit(argv, ' ')[[1]]
z2 <- as.integer(length(z))
.C('seqgen', z2, z)
indir <- '/Users/Oliver/git/HPTN071sim/tmp140914-3/SeqGen'
infile <- '140716_RUN001-3_911_POL_CP3.phy'
outdir <- indir
outfile <- '140716_RUN001-3_911_POL_CP3.seqgen'
cmd.SeqGen(indir, infile, outdir, outfile, prog.args='-n1 -k1 -on -z42', alpha=1, gamma=4, invariable=0, scale=1,
freq.A=0.25, freq.C=0.25, freq.G=0.25, freq.T=0.25,
rate.AC=1, rate.AG=1, rate.AT=1, rate.CG=1, rate.CT=1, rate.GT=1)
}
##--------------------------------------------------------------------------------------------------------
## select between host sequences
##--------------------------------------------------------------------------------------------------------
project.PANGEA.RootSeqSim.DATA.SSAfg.select.between.host<- function()
{
label.sep <- '|'
label.idx.country.id <- 2
label.idx.label <- 3
label.idx.ctime <- 4
#read hand aligned sequences
#DATA <<- '~/git/HPTN071sim/data_rootseq'
infile.gag <- 'PANGEA_SSAfg_gag_140806_n556_final.fasta'
infile.pol <- 'PANGEA_SSAfg_pol_140806_n556_final.fasta'
infile.env <- 'PANGEA_SSAfg_env_140806_n556_final.fasta'
seq.gag <- read.dna( paste(DATA,'/',infile.gag, sep=''), format='fasta' )
seq.pol <- read.dna( paste(DATA,'/',infile.pol, sep=''), format='fasta' )
seq.env <- read.dna( paste(DATA,'/',infile.env, sep=''), format='fasta' )
#check seq names are the same
stopifnot( all(rownames(seq.gag)==rownames(seq.pol)), all(rownames(seq.gag)==rownames(seq.env)) )
#ensure unique seq names
tmp <- paste( rownames(seq.gag), seq_len(nrow(seq.gag)), sep=label.sep )
rownames(seq.gag) <- rownames(seq.pol) <- rownames(seq.env) <- tmp
tmp <- strsplit( rownames(seq.gag), label.sep, fixed=1 )
label.info <- data.table( SEQ_NAME=sapply(tmp, function(x) paste(x, collapse=label.sep)), COUNTRY_ID= sapply(tmp, '[[', label.idx.country.id), LABEL= sapply(tmp, '[[', label.idx.label), CALENDAR_TIME= sapply(tmp, '[[', label.idx.ctime))
label.info <- label.info[, {
list(SEQ_NAME=SEQ_NAME, COUNTRY_ID=COUNTRY_ID, LABEL_UNIQUE=paste(LABEL,seq_along(SEQ_NAME),sep=label.sep), CALENDAR_TIME=CALENDAR_TIME)
}, by=LABEL]
setkey(label.info, SEQ_NAME)
tmp <- label.info[rownames(seq.gag), ][, paste('C',COUNTRY_ID,LABEL_UNIQUE,CALENDAR_TIME,sep=label.sep) ]
rownames(seq.gag) <- rownames(seq.pol) <- rownames(seq.env) <- tmp
set(label.info, NULL, 'SEQ_NAME', label.info[, paste('C',COUNTRY_ID,LABEL_UNIQUE,CALENDAR_TIME,sep=label.sep) ])
#concatenate gag pol env
seq.c <- do.call('cbind', list(seq.gag, seq.pol, seq.env))
#keep only first sequence with same label
tmp <- label.info[, as.integer(sapply( strsplit( LABEL_UNIQUE, label.sep, fixed=1 ), '[[', 2 ))]
label.info[, LABEL_NO:= tmp]
label.info[, SELECT:= FALSE]
#search for further within host seqs: break up label by '.' or '_'
setkey(label.info, LABEL)
tmp <- label.info[, which( grepl('\\.', LABEL) | grepl('_', LABEL) ) ]
#select manually not within host seqs
include <- c( "00BW3876_9", "00BW3886_8", "00BW3891_6", "00BW3970_2", "00BW5031_1","02ET_288","4403bmLwk4_fl7",
"702010141_CH141.w12","702010293_CH293.w8a","702010432_CH432.w4","702010440_CH440.w4","703010085_CH085.w4a",
"703010131_CH131_TF", "703010167_CH167.w8", "703010200_CH200_TFa", "703010228_CH228_TFa", "703010256_CH256.w96",
"703010269_CH269.w24", "704010042_CH042.mo6", "705010067_CH067_TF", "705010162_CH162.mo6", "705010185_CH185.mo6",
"705010198_CH198_TF", "705010534_CH534.w12", "706010164_CH164.mo6", "707010457_CH457.w8", "89SM_145", "90SE_364",
"93MW_965", "96BWMO1_5", "BD16_10", "BD22_11", "BD39_8", "BD9_11", "C.703010159.S.0dps.fl", "C.704010042.S.0dps.fl",
"C.705010162.e.wg2", "C.CAP210.w02.0dps.1_00_F4","C.CAP239.w02.0dps.1_02_F32", "C.CAP45.w02.0dps.1_05_T1", "CAP174_4w", "CAP206_8w_F1",
"CAP210_5w", "CAP228_8w_F2", "CAP229_7w", "CAP239_5w_F1", "CAP244_8w_F1", "CAP248_9w", "CAP255_8w_F1", "CAP256_6w",
"CAP257_7w_F1", "CAP30_5w_F4", "CAP45_5w_F1", "CAP61_8w_F3", "CAP63_5w_F4", "CAP65_6w", "CAP84_3w_F2", "CAP85_5w_F1",
"CAP88_5w_F2", "CAP8_3w_F2", "C_ZA_1069MB", "C_ZA_1184MB", "C_ZA_1189MB", "C_ZA_J112MA", "TV001_patent", "TV002_patent",
"ZM246F_flA1", "ZM247F_flA1", "ZM249M_flC1", "pZAC_R3714")
include <- c(include, label.info[ !grepl('\\.', LABEL) & !grepl('_', LABEL), ][, LABEL])
#these are excluded based on the '.' and '_' search:
#c("4403bmLwk4_fl11","702010293_CH293.w8b","703010200_CH200_TFb","703010200_CH200_TFc", "703010228_CH228_TFb",
# "703010256_CH256_TF", "704010042_CH042_TF", "705010162_CH162_TF", "705010185_CH185_TF", "706010164_CH164_TF",
# "C.CAP210.w02.0dps.1_00_T11", "C.CAP210.w02.0dps.1_00_T2B", "C.CAP210.w02.0dps.1_00_T3", "C.CAP210.w02.0dps.1_00_T36",
# "C.CAP210.w02.0dps.1_00_T3C", "C.CAP210.w02.0dps.1_00_T4", "C.CAP210.w02.0dps.1_00_T43", "C.CAP210.w02.0dps.1_00_T5",
# "C.CAP210.w02.0dps.1_00_T6", "C.CAP210.w05.21dps.2_00", "C.CAP210.w12.70dps.2_05_T13", "C.CAP210.w12.70dps.2_05_T13C",
# "C.CAP210.w12.70dps.2_05_T2", "C.CAP210.w12.70dps.2_05_T39w", "C.CAP210.w12.70dps.2_05_T42", "C.CAP210.w12.70dps.2_05_T5",
# "C.CAP210.w12.70dps.2_05_T8", "C.CAP210.w26.168dps.3_10_T13B", "C.CAP210.w26.168dps.3_10_T20w", "C.CAP210.w26.168dps.3_10_T23B",
# "C.CAP210.w26.168dps.3_10_T24B", "C.CAP210.w26.168dps.3_10_T24C", "C.CAP210.w26.168dps.3_10_T28", "C.CAP210.w26.168dps.3_10_T40",
# "C.CAP210.w26.168dps.3_10_T42", "C.CAP210.w26.168dps.3_10_T43B", "C.CAP210.w26.168dps.3_10_T47", "C.CAP210.w26.168dps.3_10_T49B",
# "C.CAP239.w02.0dps.1_02_T8", "C.CAP239.w05.21dps.2_00", "C.CAP239.w05.21dps.2_00_T11", "C.CAP239.w05.21dps.2_00_T17",
# "C.CAP239.w05.21dps.2_00_T18", "C.CAP239.w05.21dps.2_00_T19", "C.CAP239.w05.21dps.2_00_T21", "C.CAP239.w05.21dps.2_00_T3",
# "C.CAP239.w05.21dps.2_00_T49", "C.CAP239.w05.21dps.2_00_T8", "C.CAP239.w11.63dps.2_05_T37", "C.CAP239.w11.63dps.2_05_T47",
# "C.CAP239.w117.805dps.4_21_T44", "C.CAP239.w117.805dps.4_21_T46", "C.CAP239.w117.805dps.4_21_T50", "C.CAP239.w22.140dps.3_09_F1",
# "C.CAP239.w22.140dps.3_09_T10", "C.CAP239.w22.140dps.3_09_T17", "C.CAP239.w22.140dps.3_09_T20", "C.CAP239.w22.140dps.3_09_T36",
# "C.CAP239.w22.140dps.3_09_T39", "C.CAP239.w22.140dps.3_09_W16", "C.CAP45.w02.0dps.1_05_F3", "C.CAP45.w02.0dps.1_05_T2", "C.CAP45.w05.21dps.2_00",
# "C.CAP45.w05.21dps.2_00_T11", "C.CAP45.w05.21dps.2_00_T12", "C.CAP45.w05.21dps.2_00_T14", "C.CAP45.w05.21dps.2_00_T5", "C.CAP45.w05.21dps.2_00_T9",
# "C.CAP45.w12.70dps.2_05_F1", "C.CAP45.w12.70dps.2_05_T11", "C.CAP45.w12.70dps.2_05_T13b", "C.CAP45.w12.70dps.2_05_T18",
# "C.CAP45.w65.455dps.4_17_T14", "C.CAP45.w65.455dps.4_17_T14B", "ZM246F_flA10", "ZM246F_flA2", "ZM246F_flA6", "ZM246F_flB1",
# "ZM246F_flC12", "ZM246F_flC3" , "ZM246F_flC5", "ZM246F_flC7", "ZM246F_flD5", "ZM247F_flA12", "ZM247F_flA2", "ZM247F_flB8",
# "ZM247F_flB9", "ZM247F_flE10", "ZM247F_flE11", "ZM247F_flE3", "ZM247F_flF10", "ZM247F_flF7", "ZM247F_flG11", "ZM247F_flH1",
# "ZM249M_flC5", "ZM249M_flE10", "ZM249M_flE8", "ZM249M_flF1", "TV001_patent", "TV001_patent", "TV002_patent", "chimeric_MJ4")
exclude <- c( 'C|ZA|C_ZA_1184MB|1|2000','C|ZA|C_ZA_1189MB|1|2000','C|ZA|C_ZA_J112MA|1|2000','C|ZA|C_ZA_1069MB|1|2000',
'C|ZA|TV001_patent|1|1998','C|ZA|TV002_patent|1|1998','C|ZA|03ZASK212B1|1|2003',
'C|ZA|C.CAP239.w02.0dps.1_02_F32|1|2005',"C|ZA|C.CAP210.w02.0dps.1_00_F4|1|2005","C|ZA|C.CAP45.w02.0dps.1_05_T1|1|2005",
'C|MW|C.703010159.S.0dps.fl|1|2007','C|ZA|C.704010042.S.0dps.fl|1|2007','C|ZA|705010162_CH162.mo6|1|2007',
'C|BW|96BW15C05|1|1996','C|BW|96BW15B03|1|1996',
'C|BW|96BW01B22|1|1996','C|BW|96BW01B03|1|1996',"C|BW|96BW11B01|1|1996","C|BW|96BW1104|1|1996",
'C|BW|96BW16B01|1|1996','C|BW|96BW1626|1|1996', "C|BW|96BW0502|1|1996", "C|BW|96BW06H51|1|1996","C|BW|96BW06|1|1996",
"C|BW|96BW0407|1|1996","C|BW|96BW0402|1|1996","C|BW|96BW0410|1|1996","C|BW|96BW0408|1|1996")
set(label.info, label.info[, which( LABEL_NO==1L & LABEL%in%include )], 'SELECT', TRUE)
set(label.info, label.info[, which( SEQ_NAME%in%exclude )], 'SELECT', FALSE)
#select
tmp <- subset(label.info, SELECT)[, SEQ_NAME]
seq.c <- seq.c[tmp, ]
#save the whole lot
tmp <- rownames(seq.c)
seq.gag <- seq.gag[tmp, ]
seq.pol <- seq.pol[tmp, ]
seq.env <- seq.env[tmp, ]
seq <- seq.c #need 'seq' because expected for 3SEQ
outdir <- '~/duke/2014_Gates/methods_comparison_rootseqsim/140811'
outfile <- 'PANGEA_SSAfgBwh_140811_n415_final.R'
file <- paste(outdir, '/', outfile, sep='')
save(seq, seq.gag, seq.pol, seq.env, file=file)
#check for recombinants
}
##--------------------------------------------------------------------------------------------------------
## run 3SEQ
##--------------------------------------------------------------------------------------------------------
project.PANGEA.RootSeqSim.3SEQ.SSAfg.rm.recombinants<- function()
{
require(XML)
require(ape)
require(r3SEQ)
#DATA <<- "/work/or105/Gates_2014"
#DATA <<- '/Users/Oliver/duke/2014_Gates'
indir <- paste(DATA,'methods_comparison_rootseqsim/140811',sep='/')
infile <- 'PANGEA_SSAfgBwh_140811_n415_final.R'
outfile <- 'PANGEA_SSAfgBwhRc-_140811_n390.R'
#
# run 3SEQ
r3seq.pipe.run.3seq(indir, infile, batch.n=5, hpc.walltime=1, hpc.q=NA, hpc.mem="500mb", hpc.nproc=1)
# parse 3SEQ output
argv <<- r3seq.cmd.process.3SEQ.output(indir, infile, '', resume=1, verbose=1)
argv <<- unlist(strsplit(argv,' '))
df.recomb <- r3seq.prog.process.3SEQ.output()
# subset( df.recomb, adjp<1e-4 & min_rec_length>500)[, hist(log10(adjp), breaks=100)]
df.recomb <- subset( df.recomb, adjp<1e-7 & min_rec_length>500)
cat(paste('\nfound potential recombinants, n=',nrow(df.recomb)))
#
file <- paste(indir, '/', infile, sep='')
load(file)
tmp <- setdiff(rownames(seq), df.recomb[, child])
seq.gag <- seq.gag[tmp,]
seq.pol <- seq.pol[tmp,]
seq.env <- seq.env[tmp,]
seq <- seq[tmp,]
file <- paste(indir, '/', outfile, sep='')
save(seq, seq.gag, seq.pol, seq.env, file=file)
}
##--------------------------------------------------------------------------------------------------------
## run BEAST XML file
##--------------------------------------------------------------------------------------------------------
project.PANGEA.RootSeqSim.BEAST.SSAfg.runXML<- function()
{
#DATA <<- "/work/or105/Gates_2014"
#DATA <<- '/Users/Oliver/duke/2014_Gates'
indir <- paste(DATA,'methods_comparison_rootseqsim/140907',sep='/')
#search for XML files in indir
infiles <- list.files(indir, pattern=paste(".xml$",sep=''))
insignat <- ''
hpc.ncpu <- 8
for(infile in infiles)
{
infile <- substr(infile, 1, nchar(infile)-4)
cmd <- hivc.cmd.beast.runxml(indir, infile, insignat, prog.beast=PR.BEAST, prog.beast.opt=" -beagle -working", hpc.tmpdir.prefix="beast", hpc.ncpu=hpc.ncpu)
tmp <- paste(infile,'.timetrees',sep='')
cmd <- paste(cmd, hivc.cmd.beast.read.nexus(indir, tmp, indir, tree.id=NA, method.node.stat='any.node'), sep='\n')
cmd <- paste(cmd, hivc.cmd.beast.run.treeannotator(indir, infile, insignat, prog.beastmcc=PR.BEASTMCC, beastmcc.burnin=500, beastmcc.heights="median"), sep='\n')
cat(cmd)
cmd <- hivc.cmd.hpcwrapper(cmd, hpc.q="pqeph", hpc.nproc=hpc.ncpu, hpc.walltime=91, hpc.mem="3700mb")
outdir <- indir
outfile <- paste("b2m.",paste(strsplit(date(),split=' ')[[1]],collapse='_',sep=''),sep='')
hivc.cmd.hpccaller(outdir, outfile, cmd)
}
}
##--------------------------------------------------------------------------------------------------------
## run BEAST XML file
##--------------------------------------------------------------------------------------------------------
project.PANGEA.RootSeqSim.BEAST.SSApg.runXML<- function()
{
#DATA <<- "/work/or105/Gates_2014"
#DATA <<- '/Users/Oliver/duke/2014_Gates'
indir <- paste(DATA,'methods_comparison_rootseqsim/140902',sep='/')
#search for XML files in indir
infiles <- list.files(indir, pattern=paste(".xml$",sep=''))
insignat <- ''
hpc.ncpu <- 8
for(infile in infiles)
{
infile <- substr(infile, 1, nchar(infile)-4)
cmd <- hivc.cmd.beast.runxml(indir, infile, insignat, prog.beast=PR.BEAST, prog.beast.opt=" -beagle -working", hpc.tmpdir.prefix="beast", hpc.ncpu=hpc.ncpu)
tmp <- paste(infile,'.timetrees',sep='')
cmd <- paste(cmd, hivc.cmd.beast.read.nexus(indir, tmp, indir, tree.id=NA, method.node.stat='any.node'), sep='\n')
cmd <- paste(cmd, hivc.cmd.beast.run.treeannotator(indir, infile, insignat, prog.beastmcc=PR.BEASTMCC, beastmcc.burnin=500, beastmcc.heights="median"), sep='\n')
cat(cmd)
cmd <- hivc.cmd.hpcwrapper(cmd, hpc.q="pqeph", hpc.nproc=hpc.ncpu, hpc.walltime=791, hpc.mem="3700mb")
outdir <- indir
outfile <- paste("bpg.",paste(strsplit(date(),split=' ')[[1]],collapse='_',sep=''),sep='')
hivc.cmd.hpccaller(outdir, outfile, cmd)
}
}
##--------------------------------------------------------------------------------------------------------
## run BEAST XML file
##--------------------------------------------------------------------------------------------------------
project.PANGEA.RootSeqSim.BEAST.SSApg.runXML<- function()
{
#DATA <<- "/work/or105/Gates_2014"
#DATA <<- '/Users/Oliver/duke/2014_Gates'
indir <- paste(DATA,'methods_comparison_rootseqsim/140830',sep='/')
#search for XML files in indir
infiles <- list.files(indir, pattern=paste(".xml$",sep=''))
insignat <- ''
hpc.ncpu <- 8
for(infile in infiles)
{
infile <- substr(infile, 1, nchar(infile)-4)
cmd <- hivc.cmd.beast.runxml(indir, infile, insignat, prog.beast=PR.BEAST, prog.beast.opt=" -beagle -working", hpc.tmpdir.prefix="beast", hpc.ncpu=hpc.ncpu)
tmp <- paste(infile,'.timetrees',sep='')
cmd <- paste(cmd, hivc.cmd.beast.read.nexus(indir, tmp, indir, tree.id=NA, method.node.stat='any.node'), sep='\n')
cmd <- paste(cmd, hivc.cmd.beast.run.treeannotator(indir, infile, insignat, prog.beastmcc=PR.BEASTMCC, beastmcc.burnin=500, beastmcc.heights="median"), sep='\n')
cat(cmd)
cmd <- hivc.cmd.hpcwrapper(cmd, hpc.q="pqeph", hpc.nproc=hpc.ncpu, hpc.walltime=91, hpc.mem="3700mb")
outdir <- indir
outfile <- paste("b2m.",paste(strsplit(date(),split=' ')[[1]],collapse='_',sep=''),sep='')
hivc.cmd.hpccaller(outdir, outfile, cmd)
}
}
##--------------------------------------------------------------------------------------------------------
## create BEAST XML file for each gene
##--------------------------------------------------------------------------------------------------------
project.PANGEA.RootSeqSim.BEAST.SSApg.createXML<- function()
{
require(hivclust)
require(XML)
require(ape)
require(r3SEQ)
#DATA <<- "/work/or105/Gates_2014"
#DATA <<- '/Users/Oliver/duke/2014_Gates'
s.seed <- 42
if(1)
{
#
# to define sequences for each BEAST run
# compute NJ tree and define clusters
#
# v09: estimate frequencies
# v10: empirical frequencies -- chain gets 'stuck' in better lkl for particular freq for A/T
# center evol rate on previous estimates (Wawer, Delatorre) Normal( 1955, with 95%CI 1935-1975-> sigma=12 )
# evol rate prior U(0.0001-0.006)
infile.beast.gag<- '/Users/Oliver/git/HPTN071sim/data_rootseq/BEAST_template_v10gag.xml'
infile.beast.pol<- '/Users/Oliver/git/HPTN071sim/data_rootseq/BEAST_template_v10pol.xml'
infile.beast.env<- '/Users/Oliver/git/HPTN071sim/data_rootseq/BEAST_template_v10env.xml'
indir <- paste(DATA,'methods_comparison_rootseqsim/140907',sep='/')
infile <- 'PANGEA_SSAfgBwhRc-_140811_n390.R'
file <- paste(indir, '/', infile, sep='')
load(file)
# remove sequences without calendar time
label.sep <- '|'
label.idx.ctime <- 5
tmp <- sapply( strsplit( rownames(seq.gag), label.sep, fixed=1 ), '[[', label.idx.ctime )
tmp <- rownames(seq.gag)[ which(is.na(as.numeric(tmp))) ]
cat(paste('\nExclude sequences with no calendar date, ', paste(tmp, collapse=' ')))
tmp <- setdiff(rownames(seq.gag), tmp)
seq.gag <- seq.gag[tmp,]
# exclude last 2 nucleotides in gag to avoid incomple AA
seq.gag <- seq.gag[,1:1440]
seq.pol <- seq.pol[tmp,]
seq.env <- seq.env[tmp,]
seq <- seq[tmp,]
# get NJ tree and plot
tmp <- dist.dna( seq, model='raw' )
seq.ph <- nj(tmp)
file <- paste( indir, '/', substr(infile,1,nchar(infile)-2), '_njtree.pdf', sep='' )
pdf(file=file, w=10, h=80)
plot(seq.ph, show.tip=TRUE)
dev.off()
#
# get 3 sequence pools of equal size
#
set.seed(s.seed)
pool.n <- 3
tmp <- hivc.clu.brdist.stats(seq.ph, eval.dist.btw="leaf", stat.fun=hivc.clu.min.transmission.cascade)
thresh.brl <- 0.055
clustering <- hivc.clu.clusterbythresh(seq.ph, thresh.brl=thresh.brl, dist.brl=tmp, retval="all")
# allocate clustering tips into 3 distinct clusters
seq.clumem <- data.table( PH_NODE_ID=seq_len(Ntip(seq.ph)), CLU_ID=clustering$clu.mem[ seq_len(Ntip(seq.ph)) ] )
setkey(seq.clumem, CLU_ID)
tmp <- which(!is.na(seq.clumem[, CLU_ID]))
tmp <- seq.clumem[tmp,][, list(CLU_N=-length(PH_NODE_ID)), by='CLU_ID']
setkey(tmp, CLU_N)
set(tmp, NULL, 'POOL_ID', tmp[, cumsum(-CLU_N)])
set(tmp, NULL, 'POOL_ID', tmp[, ceiling( POOL_ID / max(POOL_ID) * pool.n ) ] )
seq.clumem <- merge(seq.clumem, subset(tmp, select=c(CLU_ID, POOL_ID)), by='CLU_ID', all.x=TRUE)
# allocate non-clustering tips into 3 distinct clusters
tmp <- subset(seq.clumem,!is.na(POOL_ID))[, list(NOCLU_N= ceiling( nrow(seq.clumem) / pool.n ) - length(PH_NODE_ID)), by='POOL_ID']
set(tmp, 1L, 'NOCLU_N', tmp[1,NOCLU_N] - ( tmp[, sum(NOCLU_N)] - ( Ntip(seq.ph) - nrow(subset(seq.clumem,!is.na(POOL_ID))) )) )
set(seq.clumem, seq.clumem[, which(is.na(POOL_ID))], 'POOL_ID', rep(tmp[,POOL_ID], tmp[,NOCLU_N]) )
seq.clumem[, table(POOL_ID)]
#
# for each sequence pool, set up BEAST run
#
verbose <- 1
bxml.template.gag <- xmlTreeParse(infile.beast.gag, useInternalNodes=TRUE, addFinalizer = TRUE)
bxml.template.pol <- xmlTreeParse(infile.beast.pol, useInternalNodes=TRUE, addFinalizer = TRUE)
bxml.template.env <- xmlTreeParse(infile.beast.env, useInternalNodes=TRUE, addFinalizer = TRUE)
for(pool.id in seq_len(pool.n))
{
pool.seqnames <- seq.ph$tip.label[ subset(seq.clumem, POOL_ID==pool.id)[, PH_NODE_ID] ]
#
#
#
cat(paste('\ncreate GAG BEAST XML file for seqs=',paste(pool.seqnames, collapse=' ')))
pool.infile <- paste( substr(infile,1,nchar(infile)-2),'_geneGAG_pool',pool.id, sep='' )
# write XML file with new sequences
bxml <- newXMLDoc(addFinalizer=T)
bxml.beast <- newXMLNode("beast", doc=bxml, addFinalizer=T)
tmp <- newXMLCommentNode(text=paste("Generated by HIVCLUST from template",infile.beast.gag), parent=bxml.beast, doc=bxml, addFinalizer=T)
# add new set of GAG sequences into GAG alignment
tmp <- seq.gag[pool.seqnames,]
bxml <- hivc.beast.add.seq(bxml, tmp, df=NULL, beast.label.datepos= 5, beast.label.sep= '|', beast.date.direction= "forwards", beast.date.units= "years", beast.alignment.id="GAG.alignment", beast.alignment.dataType= "nucleotide", verbose=1)
# copy from template
bt.beast <- getNodeSet(bxml.template.gag, "//beast")[[1]]
dummy <- sapply(seq.int( 1, xmlSize(bt.beast) ), function(i)
{
if( class(bt.beast[[i]])[1]=="XMLInternalCommentNode" )
dummy<- newXMLCommentNode(text=xmlValue(bt.beast[[i]]), parent=bxml.beast, doc=bxml, addFinalizer=T)
else
dummy<- addChildren( bxml.beast, xmlClone( bt.beast[[i]], addFinalizer=T, doc=bxml ) )
})
# change gmrf dimensions
tmp <- getNodeSet(bxml, "//*[@id='skyride.logPopSize']")
if(length(tmp)!=1) stop("unexpected number of *[@id='skyride.logPopSize'")
tmp <- tmp[[1]]
xmlAttrs(tmp)["dimension"] <- length(pool.seqnames)-1
tmp <- getNodeSet(bxml, "//*[@id='skyride.groupSize']")
if(length(tmp)!=1) stop("unexpected number of *[@id='skyride.groupSize'")
tmp <- tmp[[1]]
xmlAttrs(tmp)["dimension"] <- length(pool.seqnames)-1
# change outfile name
bxml.onodes <- getNodeSet(bxml, "//*[@fileName]")
tmp <- sapply(bxml.onodes, function(x) xmlGetAttr(x,"fileName"))
tmp <- gsub("(time).","time",tmp,fixed=1)
tmp <- gsub("(subst).","subst",tmp,fixed=1)
tmp <- sapply(strsplit(tmp,'.',fixed=1), function(x) paste(pool.infile, '.', tail(x,1), sep=''))
dummy <- sapply(seq_along(bxml.onodes), function(i){ xmlAttrs(bxml.onodes[[i]])["fileName"]<- tmp[i] })
# write to file
file <- paste(indir,'/',pool.infile,".xml", sep='')
if(verbose) cat(paste("\nwrite xml file to",file))
saveXML(bxml, file=file)
#
# POL
#
cat(paste('\ncreate POL BEAST XML file for seqs=',paste(pool.seqnames, collapse=' ')))
pool.infile <- paste( substr(infile,1,nchar(infile)-2),'_genePOL_pool',pool.id, sep='' )
# write XML file with new sequences
bxml <- newXMLDoc(addFinalizer=T)
bxml.beast <- newXMLNode("beast", doc=bxml, addFinalizer=T)
tmp <- newXMLCommentNode(text=paste("Generated by HIVCLUST from template",infile.beast.pol), parent=bxml.beast, doc=bxml, addFinalizer=T)
# add new set of GAG sequences into GAG alignment
tmp <- seq.pol[pool.seqnames,]
bxml <- hivc.beast.add.seq(bxml, tmp, df=NULL, beast.label.datepos= 5, beast.label.sep= '|', beast.date.direction= "forwards", beast.date.units= "years", beast.alignment.id="POL.alignment", beast.alignment.dataType= "nucleotide", verbose=1)
# copy from template
bt.beast <- getNodeSet(bxml.template.pol, "//beast")[[1]]
dummy <- sapply(seq.int( 1, xmlSize(bt.beast) ), function(i)
{
if( class(bt.beast[[i]])[1]=="XMLInternalCommentNode" )
dummy<- newXMLCommentNode(text=xmlValue(bt.beast[[i]]), parent=bxml.beast, doc=bxml, addFinalizer=T)
else
dummy<- addChildren( bxml.beast, xmlClone( bt.beast[[i]], addFinalizer=T, doc=bxml ) )
})
# change gmrf dimensions
tmp <- getNodeSet(bxml, "//*[@id='skyride.logPopSize']")
if(length(tmp)!=1) stop("unexpected number of *[@id='skyride.logPopSize'")
tmp <- tmp[[1]]
xmlAttrs(tmp)["dimension"] <- length(pool.seqnames)-1
tmp <- getNodeSet(bxml, "//*[@id='skyride.groupSize']")
if(length(tmp)!=1) stop("unexpected number of *[@id='skyride.groupSize'")
tmp <- tmp[[1]]
xmlAttrs(tmp)["dimension"] <- length(pool.seqnames)-1
# change outfile name
bxml.onodes <- getNodeSet(bxml, "//*[@fileName]")
tmp <- sapply(bxml.onodes, function(x) xmlGetAttr(x,"fileName"))
tmp <- gsub("(time).","time",tmp,fixed=1)
tmp <- gsub("(subst).","subst",tmp,fixed=1)
tmp <- sapply(strsplit(tmp,'.',fixed=1), function(x) paste(pool.infile, '.', tail(x,1), sep=''))
dummy <- sapply(seq_along(bxml.onodes), function(i){ xmlAttrs(bxml.onodes[[i]])["fileName"]<- tmp[i] })
# write to file
file <- paste(indir,'/',pool.infile,".xml", sep='')
if(verbose) cat(paste("\nwrite xml file to",file))
saveXML(bxml, file=file)
#
# ENV
#
cat(paste('\ncreate ENV BEAST XML file for seqs=',paste(pool.seqnames, collapse=' ')))
pool.infile <- paste( substr(infile,1,nchar(infile)-2),'_geneENV_pool',pool.id, sep='' )
# write XML file with new sequences
bxml <- newXMLDoc(addFinalizer=T)
bxml.beast <- newXMLNode("beast", doc=bxml, addFinalizer=T)
tmp <- newXMLCommentNode(text=paste("Generated by HIVCLUST from template",infile.beast.env), parent=bxml.beast, doc=bxml, addFinalizer=T)
# add new set of GAG sequences into GAG alignment
tmp <- seq.env[pool.seqnames,]
bxml <- hivc.beast.add.seq(bxml, tmp, df=NULL, beast.label.datepos= 5, beast.label.sep= '|', beast.date.direction= "forwards", beast.date.units= "years", beast.alignment.id="ENV.alignment", beast.alignment.dataType= "nucleotide", verbose=1)
# copy from template
bt.beast <- getNodeSet(bxml.template.env, "//beast")[[1]]
dummy <- sapply(seq.int( 1, xmlSize(bt.beast) ), function(i)
{
if( class(bt.beast[[i]])[1]=="XMLInternalCommentNode" )
dummy<- newXMLCommentNode(text=xmlValue(bt.beast[[i]]), parent=bxml.beast, doc=bxml, addFinalizer=T)
else
dummy<- addChildren( bxml.beast, xmlClone( bt.beast[[i]], addFinalizer=T, doc=bxml ) )
})
# change gmrf dimensions
tmp <- getNodeSet(bxml, "//*[@id='skyride.logPopSize']")
if(length(tmp)!=1) stop("unexpected number of *[@id='skyride.logPopSize'")
tmp <- tmp[[1]]
xmlAttrs(tmp)["dimension"] <- length(pool.seqnames)-1
tmp <- getNodeSet(bxml, "//*[@id='skyride.groupSize']")
if(length(tmp)!=1) stop("unexpected number of *[@id='skyride.groupSize'")
tmp <- tmp[[1]]
xmlAttrs(tmp)["dimension"] <- length(pool.seqnames)-1
# change outfile name
bxml.onodes <- getNodeSet(bxml, "//*[@fileName]")
tmp <- sapply(bxml.onodes, function(x) xmlGetAttr(x,"fileName"))
tmp <- gsub("(time).","time",tmp,fixed=1)
tmp <- gsub("(subst).","subst",tmp,fixed=1)
tmp <- sapply(strsplit(tmp,'.',fixed=1), function(x) paste(pool.infile, '.', tail(x,1), sep=''))
dummy <- sapply(seq_along(bxml.onodes), function(i){ xmlAttrs(bxml.onodes[[i]])["fileName"]<- tmp[i] })
# write to file
file <- paste(indir,'/',pool.infile,".xml", sep='')
if(verbose) cat(paste("\nwrite xml file to",file))
saveXML(bxml, file=file)
}
}
#
}
##--------------------------------------------------------------------------------------------------------
## create BEAST XML file
##--------------------------------------------------------------------------------------------------------
project.PANGEA.RootSeqSim.BEAST.SSAfg.createXML<- function()
{
require(hivclust)
require(XML)
require(ape)
require(r3SEQ)
#DATA <<- "/work/or105/Gates_2014"
#DATA <<- '/Users/Oliver/duke/2014_Gates'
s.seed <- 42
if(1)
{
#
# to define sequences for each BEAST run
# compute NJ tree and define clusters
#
infile.beast <- '/Users/Oliver/git/HPTN071sim/data_rootseq/BEAST_template_v09.xml'
indir <- paste(DATA,'methods_comparison_rootseqsim/140830',sep='/')
infile <- 'PANGEA_SSAfgBwhRc-_140811_n390.R'
file <- paste(indir, '/', infile, sep='')
load(file)
# remove sequences without calendar time
label.sep <- '|'
label.idx.ctime <- 5
tmp <- sapply( strsplit( rownames(seq.gag), label.sep, fixed=1 ), '[[', label.idx.ctime )
tmp <- rownames(seq.gag)[ which(is.na(as.numeric(tmp))) ]
cat(paste('\nExclude sequences with no calendar date, ', paste(tmp, collapse=' ')))
tmp <- setdiff(rownames(seq.gag), tmp)
seq.gag <- seq.gag[tmp,]
# exclude last 2 nucleotides in gag to avoid incomple AA
seq.gag <- seq.gag[,1:1440]
seq.pol <- seq.pol[tmp,]
seq.env <- seq.env[tmp,]
seq <- seq[tmp,]
# get NJ tree and plot
tmp <- dist.dna( seq, model='raw' )
seq.ph <- nj(tmp)
file <- paste( indir, '/', substr(infile,1,nchar(infile)-2), '_njtree.pdf', sep='' )
pdf(file=file, w=10, h=80)
plot(seq.ph, show.tip=TRUE)
dev.off()
#
# get 3 sequence pools of equal size
#
set.seed(s.seed)
pool.n <- 3
tmp <- hivc.clu.brdist.stats(seq.ph, eval.dist.btw="leaf", stat.fun=hivc.clu.min.transmission.cascade)
thresh.brl <- 0.055
clustering <- hivc.clu.clusterbythresh(seq.ph, thresh.brl=thresh.brl, dist.brl=tmp, retval="all")
# allocate clustering tips into 3 distinct clusters
seq.clumem <- data.table( PH_NODE_ID=seq_len(Ntip(seq.ph)), CLU_ID=clustering$clu.mem[ seq_len(Ntip(seq.ph)) ] )
setkey(seq.clumem, CLU_ID)
tmp <- which(!is.na(seq.clumem[, CLU_ID]))
tmp <- seq.clumem[tmp,][, list(CLU_N=-length(PH_NODE_ID)), by='CLU_ID']
setkey(tmp, CLU_N)
set(tmp, NULL, 'POOL_ID', tmp[, cumsum(-CLU_N)])
set(tmp, NULL, 'POOL_ID', tmp[, ceiling( POOL_ID / max(POOL_ID) * pool.n ) ] )
seq.clumem <- merge(seq.clumem, subset(tmp, select=c(CLU_ID, POOL_ID)), by='CLU_ID', all.x=TRUE)
# allocate non-clustering tips into 3 distinct clusters
tmp <- subset(seq.clumem,!is.na(POOL_ID))[, list(NOCLU_N= ceiling( nrow(seq.clumem) / pool.n ) - length(PH_NODE_ID)), by='POOL_ID']
set(tmp, 1L, 'NOCLU_N', tmp[1,NOCLU_N] - ( tmp[, sum(NOCLU_N)] - ( Ntip(seq.ph) - nrow(subset(seq.clumem,!is.na(POOL_ID))) )) )
set(seq.clumem, seq.clumem[, which(is.na(POOL_ID))], 'POOL_ID', rep(tmp[,POOL_ID], tmp[,NOCLU_N]) )
seq.clumem[, table(POOL_ID)]
#
# for each sequence pool, set up BEAST run
#
verbose <- 1
bxml.template <- xmlTreeParse(infile.beast, useInternalNodes=TRUE, addFinalizer = TRUE)
for(pool.id in seq_len(pool.n))
{
pool.infile <- paste( substr(infile,1,nchar(infile)-2),'_pool',pool.id, sep='' )
pool.seqnames <- seq.ph$tip.label[ subset(seq.clumem, POOL_ID==pool.id)[, PH_NODE_ID] ]
#
if(0)
{
cat(paste('\ncreate FASTA file for seqs=',paste(pool.seqnames, collapse=' ')))
file <- paste(indir,'/',pool.infile,"_GAG.fa", sep='')
write.dna(seq.gag[pool.seqnames,], format='fasta', file=file)
file <- paste(indir,'/',pool.infile,"_POL.fa", sep='')
write.dna(seq.pol[pool.seqnames,], format='fasta', file=file)
file <- paste(indir,'/',pool.infile,"_ENV.fa", sep='')
write.dna(seq.env[pool.seqnames,], format='fasta', file=file)
tmp <- do.call('cbind',list( seq.gag[pool.seqnames,],seq.pol[pool.seqnames,],seq.env[pool.seqnames,] ))
file <- paste(indir,'/',pool.infile,".fa", sep='')
write.dna(tmp, format='fasta', file=file)
}
#
cat(paste('\ncreate BEAST XML file for seqs=',paste(pool.seqnames, collapse=' ')))
# write XML file with new sequences
bxml <- newXMLDoc(addFinalizer=T)
bxml.beast <- newXMLNode("beast", doc=bxml, addFinalizer=T)
newXMLCommentNode(text=paste("Generated by HIVCLUST from template",infile.beast), parent=bxml.beast, doc=bxml, addFinalizer=T)
# add new set of ENV sequences into alignment ID 1
tmp <- seq.env[pool.seqnames,]
bxml <- hivc.beast.add.seq(bxml, tmp, df=NULL, beast.label.datepos= 5, beast.label.sep= '|', beast.date.direction= "forwards", beast.date.units= "years", beast.alignment.id="alignment1", beast.alignment.dataType= "nucleotide", verbose=1)
# add new set of GAG sequences into alignment ID 2
tmp <- seq.gag[pool.seqnames,]
bxml <- hivc.beast.add.seq(bxml, tmp, df=NULL, beast.label.datepos= 5, beast.label.sep= '|', beast.date.direction= "forwards", beast.date.units= "years", beast.alignment.id="alignment2", beast.alignment.dataType= "nucleotide", verbose=1)
# add new set of POL sequences into alignment ID 3
tmp <- seq.pol[pool.seqnames,]
bxml <- hivc.beast.add.seq(bxml, tmp, df=NULL, beast.label.datepos= 5, beast.label.sep= '|', beast.date.direction= "forwards", beast.date.units= "years", beast.alignment.id="alignment3", beast.alignment.dataType= "nucleotide", verbose=1)
# copy from template
bt.beast <- getNodeSet(bxml.template, "//beast")[[1]]
dummy <- sapply(seq.int( 1, xmlSize(bt.beast) ), function(i)
{
if( class(bt.beast[[i]])[1]=="XMLInternalCommentNode" )
dummy<- newXMLCommentNode(text=xmlValue(bt.beast[[i]]), parent=bxml.beast, doc=bxml, addFinalizer=T)
else
dummy<- addChildren( bxml.beast, xmlClone( bt.beast[[i]], addFinalizer=T, doc=bxml ) )
})
# change gmrf dimensions
tmp <- getNodeSet(bxml, "//*[@id='skyride.logPopSize']")
if(length(tmp)!=1) stop("unexpected number of *[@id='skyride.logPopSize'")
tmp <- tmp[[1]]
xmlAttrs(tmp)["dimension"] <- length(pool.seqnames)-1
tmp <- getNodeSet(bxml, "//*[@id='skyride.groupSize']")
if(length(tmp)!=1) stop("unexpected number of *[@id='skyride.groupSize'")
tmp <- tmp[[1]]
xmlAttrs(tmp)["dimension"] <- length(pool.seqnames)-1
# change outfile name
bxml.onodes <- getNodeSet(bxml, "//*[@fileName]")
tmp <- sapply(bxml.onodes, function(x) xmlGetAttr(x,"fileName"))
tmp <- gsub("(time).","time",tmp,fixed=1)
tmp <- gsub("(subst).","subst",tmp,fixed=1)
tmp <- sapply(strsplit(tmp,'.',fixed=1), function(x) paste(pool.infile, '.', tail(x,1), sep=''))
dummy <- sapply(seq_along(bxml.onodes), function(i){ xmlAttrs(bxml.onodes[[i]])["fileName"]<- tmp[i] })
# write to file
file <- paste(indir,'/',pool.infile,".xml", sep='')
if(verbose) cat(paste("\nwrite xml file to",file))
saveXML(bxml, file=file)
}
}
#
}
##--------------------------------------------------------------------------------------------------------
## process BEAST log file and extract GTR parameters
##--------------------------------------------------------------------------------------------------------
project.PANGEA.RootSeqSim.BEAST.SSAfg.getGTR<- function()
{
tree.id.burnin <- 2e7
tree.id.labelsep <- '|'
dir.name <- '/Users/Oliver/duke/2014_Gates'
indir <- paste(dir.name,'methods_comparison_rootseqsim/140813',sep='/')
outdir <- indir
# search for BEAST output
infiles <- list.files(indir)
infiles <- infiles[ sapply(infiles, function(x) grepl('pool[0-9].log$',x) ) ]
# collect log variables
log.df <- lapply(seq_along(infiles), function(i)
{
infile <- infiles[i]
cat(paste('\nprocess file', infile))
file <- paste(indir, '/', infile, sep='')
df <- as.data.table(read.delim(file, comment.char='#'))
cat(paste('\nignore logs for\n',paste(colnames(df)[ !grepl('state|POL|GAG|ENV|ucld|meanRate',colnames(df)) ], collapse=', ') ))
df <- subset(df, select=which(grepl('state|POL|GAG|ENV|ucld|meanRate',colnames(df))))
log.df <- c( paste('frequencies',1:4,sep=''), 'mu', 'alpha', '\\.at', '\\.ac', '\\.cg', '\\.ag', '\\.gt', 'treeLikelihood' )
log.df <- lapply( log.df, function(x)
{
tmp <- melt( subset(df, select=which(grepl(paste('state|',x,sep=''),colnames(df)))), id='state', value.name=x)
tmp[, GENE:= tmp[,substr(variable, 1, 3)]]
tmp[, CODON_POS:=tmp[, regmatches(variable, regexpr('CP[1-3]',variable))]]
subset(tmp, select=which(colnames(tmp)!='variable'))
})
tmp <- log.df[[1]]
for(j in seq_along(log.df)[-1])
tmp <- merge(tmp, log.df[[j]], by=c('state','GENE','CODON_POS'))
log.df <- tmp
log.df <- merge(log.df, subset(df, select=which(grepl('state|ucld|meanRate',colnames(df)))), by='state')
log.df[, FILE:= regmatches(infile, regexpr('pool[0-9]+',infile))]
log.df
})
log.df <- do.call('rbind',log.df)
setnames(log.df, colnames(log.df), gsub('\\.','',colnames(log.df),fixed=TRUE))
setnames(log.df, paste('frequencies',1:4,sep=''), c('a','c','g','t') )
log.df <- subset(log.df, state>tree.id.burnin)
# check mean of relative rates
#tmp <- log.df[, list(mmu=mean(mu), n=length(mu)), by=c('FILE','GENE', 'state')]
file <- paste( substr(infiles[1],1,nchar(infiles[1])-9),'log.R',sep='' )
file <- paste( outdir, '/', file, sep='')
cat(paste('\nsave to file', file))
save(file=file, log.df)
#
tmp <- copy(log.df)
tmp <- melt(tmp, id=c('state','GENE','CODON_POS','FILE'))
ggplot( tmp, aes(x=value, fill=FILE)) + geom_histogram() + facet_grid(GENE+CODON_POS~variable, scales='free')
file <- paste( substr(file,1,nchar(file)-1),'pdf',sep='' )
ggsave(file, h=15, w=20)
}
##--------------------------------------------------------------------------------------------------------
## process BEAST log file and extract GTR parameters
##--------------------------------------------------------------------------------------------------------
project.PANGEA.RootSeqSim.BEAST.SSApg.getGTR<- function()
{
require(XML)
tree.id.burnin <- 2e7
tree.id.labelsep <- '|'
dir.name <- '/Users/Oliver/duke/2014_Gates'
indir <- paste(dir.name,'methods_comparison_rootseqsim/140907',sep='/')
outdir <- indir
# compute frequencies from BEAST xml
infiles <- list.files(indir)
infiles <- infiles[ sapply(infiles, function(x) grepl('pool[0-9].xml$',x) ) ]
freq <- lapply(seq_along(infiles), function(i)
{
infile <- infiles[i]
cat(paste('\nprocess file', infile))
file <- paste(indir, '/', infile, sep='')
bxml <- xmlTreeParse(file, useInternalNodes=TRUE, addFinalizer = TRUE)
bseq <- hivc.beast.get.sequences(bxml, verbose=1)
tmp <- tolower(do.call('rbind',strsplit(bseq[, SEQ],'')))
bseq.CP1 <- as.DNAbin( tmp[, seq.int(1, ncol(tmp), by=3)] )
bseq.CP2 <- as.DNAbin( tmp[, seq.int(2, ncol(tmp), by=3)] )
bseq.CP3 <- as.DNAbin( tmp[, seq.int(3, ncol(tmp), by=3)] )
bseq[1, substr(ALIGNMENT_ID,1,3)]
tmp <- list( data.table(FREQ=c('a','c','g','t'), VALUE=base.freq( bseq.CP1 ), CODON_POS='CP1', GENE=bseq[1, substr(ALIGNMENT_ID,1,3)]),
data.table(FREQ=c('a','c','g','t'), VALUE=base.freq( bseq.CP2 ), CODON_POS='CP2', GENE=bseq[1, substr(ALIGNMENT_ID,1,3)]),
data.table(FREQ=c('a','c','g','t'), VALUE=base.freq( bseq.CP3 ), CODON_POS='CP3', GENE=bseq[1, substr(ALIGNMENT_ID,1,3)]) )
tmp <- do.call('rbind', tmp)
tmp[, FILE:=regmatches(infile, regexpr('pool[0-9]+',infile))]
tmp
})
freq <- do.call('rbind', freq)
freq <- dcast.data.table(freq, CODON_POS+GENE+FILE~FREQ, value.var='VALUE')
# search for BEAST output
infiles <- list.files(indir)
infiles <- infiles[ sapply(infiles, function(x) grepl('pool[0-9].log$',x) ) ]
# collect log variables
log.df <- lapply(seq_along(infiles), function(i)
{
infile <- infiles[i]
cat(paste('\nprocess file', infile))
file <- paste(indir, '/', infile, sep='')
df <- as.data.table(read.delim(file, comment.char='#'))
cat(paste('\nignore logs for\n',paste(colnames(df)[ !grepl('state|POL|GAG|ENV|ucld|meanRate|coefficientOfVariation|treeModel.rootHeight',colnames(df)) ], collapse=', ') ))
df <- subset(df, select=which(grepl('state|POL|GAG|ENV|ucld|meanRate|coefficientOfVariation|treeModel.rootHeight',colnames(df))))
#log.df <- c( paste('frequencies',1:4,sep=''), 'mu', 'alpha', '\\.at', '\\.ac', '\\.cg', '\\.ag', '\\.gt' )
log.df <- c( 'mu', 'alpha', '\\.at', '\\.ac', '\\.cg', '\\.ag', '\\.gt' )
log.df <- lapply( log.df, function(x)
{
tmp <- melt( subset(df, select=which(grepl(paste('state|',x,sep=''),colnames(df)))), id='state', value.name=x)
tmp[, GENE:= tmp[,substr(variable, 1, 3)]]
tmp[, CODON_POS:=tmp[, regmatches(variable, regexpr('CP[1-3]',variable))]]
subset(tmp, select=which(colnames(tmp)!='variable'))
})
tmp <- log.df[[1]]
for(j in seq_along(log.df)[-1])
tmp <- merge(tmp, log.df[[j]], by=c('state','GENE','CODON_POS'))
log.df <- tmp
log.df <- merge(log.df, subset(df, select=which(grepl('state|ucld|meanRate|coefficientOfVariation|treeModel.rootHeight',colnames(df)))), by='state')
log.df[, FILE:= regmatches(infile, regexpr('pool[0-9]+',infile))]
log.df
})
log.df <- do.call('rbind',log.df)
setnames(log.df, colnames(log.df), gsub('\\.','',colnames(log.df),fixed=TRUE))
# handle empirical freq
if(any(grepl('frequencies',colnames(log.df))))
setnames(log.df, paste('frequencies',1:4,sep=''), c('a','c','g','t') )
if(all(!grepl('frequencies',colnames(log.df))))
{
log.df <- merge(log.df, freq, by=c('GENE','CODON_POS','FILE'))
}
log.df <- subset(log.df, state>tree.id.burnin)
# check mean of relative rates
tmp <- log.df[, list(mmu=mean(mu), n=length(mu)), by=c('FILE','GENE', 'state')]
stopifnot( nrow(subset(tmp, abs(mmu-1)>2*EPS))==0 )
#
file <- paste( substr(infiles[1],1,nchar(infiles[1])-9),'log.R',sep='' )
file <- "PANGEA_SSAfgBwhRc-_140907_n390_log.R"
file <- paste( outdir, '/', file, sep='')
cat(paste('\nsave to file', file))
save(file=file, log.df)
#
tmp <- copy(log.df)
tmp <- melt(tmp, id=c('state','GENE','CODON_POS','FILE'))
ggplot( tmp, aes(x=value, fill=FILE)) + geom_histogram() + facet_grid(GENE+CODON_POS~variable, scales='free')
file <- paste( substr(file,1,nchar(file)-1),'pdf',sep='' )
ggsave(file, h=15, w=20)
}
##--------------------------------------------------------------------------------------------------------
## get ancestral sequences from BEAST XML
##--------------------------------------------------------------------------------------------------------
project.PANGEA.RootSeqSim.BEAST.SSApg.getancestralseq.from.output<- function()
{
tree.id.burnin <- 2e7
tree.id.labelsep <- '|'
ancseq.excl.timediff<- 3
dir.name <- '/Users/Oliver/duke/2014_Gates'
indir <- paste(dir.name,'methods_comparison_rootseqsim/140907',sep='/')
ancseq.label.prefix <- 'PANGEA_SSApgBwhRc-_140907_n390'
outdir <- indir
# search for BEAST output
files <- list.files(indir)
files <- files[ sapply(files, function(x) grepl('pool[0-9].R$',x) ) ]
if(!length(files)) stop('cannot find files matching criteria')
# load and process BEAST PARSER output
# sampling times are different for each gene, as they come from different trees
anc.seq <- lapply(files, function(file)
{
cat(paste('\nProcess file=', file ))
load( paste(indir, file, sep='/') ) # expect tree, node.stat
# compute gag pol env ancestral sequences
anc.seq <- PANGEA.RootSeqSim.get.ancestral.seq.pg(tree, node.stat, tree.id.sep='_', tree.id.idx.mcmcit=2, tree.id.burnin=tree.id.burnin, label.sep=tree.id.labelsep, label.idx.ctime=5)
anc.seq[, POOL:= regmatches(file, regexpr('pool[0-9]+', file)) ]
set(anc.seq, NULL, 'BEAST_MCMC_IT', NULL )
anc.seq
})
anc.seq <- do.call('rbind',anc.seq)
set(anc.seq, NULL, 'POOL', anc.seq[, factor(POOL)])
set(anc.seq, NULL, 'GENE', anc.seq[, factor(GENE)])
set(anc.seq, NULL, 'TREE_ID', anc.seq[, factor(TREE_ID)])
anc.seq[, LABEL:=NULL]
# check we have exactly 3 genes for every inner node
tmp <- anc.seq[, list(n=length(GENE)), by=c('POOL','TREE_ID','NODE_ID')]
stopifnot(tmp[, all(n==3)])
# prelim save
file <- paste( outdir, '/', substr(files[1],1,nchar(files[1])-7), 'AncSeq_Raw.R',sep='' )
save(anc.seq, file=file)
# concatenate sequences in time order (so we don t loose too many sequences)
anc.seq.gag <- subset(anc.seq, GENE=='GAG')
setnames(anc.seq.gag, c('SEQ','CALENDAR_TIME','POOL'), c('GAG','GAG_CALENDAR_TIME','GAG_POOL'))
anc.seq.pol <- subset(anc.seq, GENE=='POL')
setnames(anc.seq.pol, c('SEQ','CALENDAR_TIME','POOL'), c('POL','POL_CALENDAR_TIME','POL_POOL'))
anc.seq.env <- subset(anc.seq, GENE=='ENV')
setnames(anc.seq.env, c('SEQ','CALENDAR_TIME','POOL'), c('ENV','ENV_CALENDAR_TIME','ENV_POOL'))
setkey(anc.seq.gag, GAG_CALENDAR_TIME)
setkey(anc.seq.pol, POL_CALENDAR_TIME)
setkey(anc.seq.env, ENV_CALENDAR_TIME)
#
anc.seq <- cbind( subset(anc.seq.gag, select=c(GAG, GAG_CALENDAR_TIME)), subset(anc.seq.pol, select=c(POL, POL_CALENDAR_TIME)) )
anc.seq <- cbind( anc.seq, subset(anc.seq.env, select=c(ENV, ENV_CALENDAR_TIME) ))
cat(paste('\nFound starting sequences, n=', nrow(anc.seq)))
# exclude concatenated genes if TIME_SEQ difference too large
anc.seq[, d.gp:= abs(GAG_CALENDAR_TIME-POL_CALENDAR_TIME)]
anc.seq[, d.ge:= abs(GAG_CALENDAR_TIME-ENV_CALENDAR_TIME)]
anc.seq[, d.pe:= abs(POL_CALENDAR_TIME-ENV_CALENDAR_TIME)]
anc.seq <- subset(anc.seq, d.gp<=ancseq.excl.timediff & d.ge<=ancseq.excl.timediff & d.pe<=ancseq.excl.timediff)
cat(paste('\nKeep starting sequences with sufficiently close TIME_SEQ, n=', nrow(anc.seq)))
anc.seq[, d.gp:=NULL]
anc.seq[, d.ge:=NULL]
anc.seq[, d.pe:=NULL]
# finalize
anc.seq[, CALENDAR_TIME:= (GAG_CALENDAR_TIME+POL_CALENDAR_TIME+ENV_CALENDAR_TIME)/3]
anc.seq[, GAG_CALENDAR_TIME:=NULL]
anc.seq[, POL_CALENDAR_TIME:=NULL]
anc.seq[, ENV_CALENDAR_TIME:=NULL]
anc.seq[, LABEL:= paste(ancseq.label.prefix, tree.id.labelsep, 'SEQ_', seq_len(nrow(anc.seq)), tree.id.labelsep, round(CALENDAR_TIME, d=4), sep='')]
anc.seq <- subset(anc.seq, CALENDAR_TIME>1935)
anc.seq.gag <- anc.seq.pol <- anc.seq.env <- NULL
gc()
#
# plot distribution of node times
#
tmp <- subset(anc.seq, select=c(CALENDAR_TIME))
ggplot(tmp, aes(x=CALENDAR_TIME)) + geom_histogram(binwidth=1) + scale_x_continuous(breaks=seq(1900,2020, 5), name='estimated inner node time\n(year)')
file <- paste( outdir, '/', substr(files[1],1,nchar(files[1])-7), 'AncSeq_Times.pdf',sep='' )
ggsave(file=file, w=10, h=6)
#
# return DNAbin
#
tmp <- c(seq(1, nrow(anc.seq), 5e4), nrow(anc.seq)+1)
anc.seq.gag <- lapply(seq_along(tmp)[-length(tmp)], function(i)
{
cat(paste('\nProcess GAG up to',tmp[i+1]-1))
anc.seq.gag <- tolower(do.call('rbind',strsplit(anc.seq[seq.int(tmp[i], tmp[i+1]-1), GAG],'')))
rownames(anc.seq.gag) <- anc.seq[seq.int(tmp[i], tmp[i+1]-1), LABEL]
anc.seq.gag <- as.DNAbin(anc.seq.gag)
})
anc.seq.gag <- do.call('rbind', anc.seq.gag)
anc.seq.pol <- lapply(seq_along(tmp)[-length(tmp)], function(i)
{
cat(paste('\nProcess POL up to',tmp[i+1]-1))
anc.seq.pol <- tolower(do.call('rbind',strsplit(anc.seq[seq.int(tmp[i], tmp[i+1]-1), POL],'')))
rownames(anc.seq.pol) <- anc.seq[seq.int(tmp[i], tmp[i+1]-1), LABEL]
anc.seq.pol <- as.DNAbin(anc.seq.pol)
})
anc.seq.pol <- do.call('rbind', anc.seq.pol)
anc.seq.env <- lapply(seq_along(tmp)[-length(tmp)], function(i)
{
cat(paste('\nProcess ENV up to',tmp[i+1]-1))
anc.seq.env <- tolower(do.call('rbind',strsplit(anc.seq[seq.int(tmp[i], tmp[i+1]-1), ENV],'')))
rownames(anc.seq.env) <- anc.seq[seq.int(tmp[i], tmp[i+1]-1), LABEL]
anc.seq.env <- as.DNAbin(anc.seq.env)
})
anc.seq.env <- do.call('rbind', anc.seq.env)
#
# return info data.table
#
set( anc.seq, NULL, 'GAG', NULL )
set( anc.seq, NULL, 'POL', NULL )
set( anc.seq, NULL, 'ENV', NULL )
anc.seq.info <- anc.seq
# save
file <- "/Users/Oliver/duke/2014_Gates/methods_comparison_rootseqsim/140907/PANGEA_SSAfgBwhRc-_140907_n390_AncSeq.R"
cat(paste('\nwrite Ancestral Sequences to ',file))
save(file=file, anc.seq.gag, anc.seq.pol, anc.seq.env, anc.seq.info)
}
##--------------------------------------------------------------------------------------------------------
## get anecestral sequences from BEAST XML
##--------------------------------------------------------------------------------------------------------
project.PANGEA.RootSeqSim.BEAST.SSAfg.getancestralseq.from.output<- function()
{
tree.id.burnin <- 2e7
tree.id.labelsep <- '|'
dir.name <- '/Users/Oliver/duke/2014_Gates'
indir <- paste(dir.name,'methods_comparison_rootseqsim/140813',sep='/')
outdir <- indir
# search for BEAST output
files <- list.files(indir)
files <- files[ sapply(files, function(x) grepl('pool[0-9].R$',x) ) ]
if(!length(files)) stop('cannot find files matching criteria')
# load and process BEAST PARSER output
anc.seq <- lapply(files, function(file)
{
cat(paste('\nProcess file=', file ))
load( paste(indir, file, sep='/') ) # expect tree, node.stat
# compute gag pol env ancestral sequences
anc.seq <- PANGEA.RootSeqSim.get.ancestral.seq(tree, node.stat, tree.id.sep='_', tree.id.idx.mcmcit=2, tree.id.burnin=tree.id.burnin, label.sep=tree.id.labelsep, label.idx.ctime=5)
set(anc.seq, NULL, 'LABEL', anc.seq[, paste( substr(file,1,nchar(file)-2), LABEL, sep=tree.id.labelsep )] )
set(anc.seq, NULL, 'TREE_ID', NULL )
set(anc.seq, NULL, 'NODE_ID', NULL )
set(anc.seq, NULL, 'BEAST_MCMC_IT', NULL )
anc.seq
})
anc.seq <- do.call('rbind',anc.seq)
#
# return DNAbin
#
anc.seq.gag <- tolower(do.call('rbind',strsplit(anc.seq[, GAG],'')))
rownames(anc.seq.gag) <- anc.seq[, LABEL]
anc.seq.gag <- as.DNAbin(anc.seq.gag)
anc.seq.pol <- tolower(do.call('rbind',strsplit(anc.seq[, POL],'')))
rownames(anc.seq.pol) <- anc.seq[, LABEL]
anc.seq.pol <- as.DNAbin(anc.seq.pol)
anc.seq.env <- tolower(do.call('rbind',strsplit(anc.seq[, ENV],'')))
rownames(anc.seq.env) <- anc.seq[, LABEL]
anc.seq.env <- as.DNAbin(anc.seq.env)
set( anc.seq, NULL, 'GAG', NULL )
set( anc.seq, NULL, 'POL', NULL )
set( anc.seq, NULL, 'ENV', NULL )
anc.seq.info <- anc.seq
#anc.seq <- cbind(anc.seq.gag, anc.seq.pol, anc.seq.env)
#
outfile <- paste( substr(files[1],1,nchar(files[1])-7), 'AncSeq.R',sep='' )
file <- paste(outdir, outfile, sep='/')
cat(paste('\nwrite Ancestral Sequences to ',file))
save(file=file, anc.seq.gag, anc.seq.pol, anc.seq.env, anc.seq.info)
}
##--------------------------------------------------------------------------------------------------------
## check simulated sequences: create NJ tree and estimate R2
## olli 14.09.14
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.SSApg.NJR2<- function()
{
require(phytools)
tree.id.labelsep <- '|'
tree.id.label.idx.ctime <- 4
indir <- '/Users/Oliver/git/HPTN071sim/tmp140914/140716_RUN001_INTERNAL'
outdir <- '/Users/Oliver/git/HPTN071sim/tmp140914/140716_RUN001_INTERNAL'
#indir <- '/Users/Oliver/git/HPTN071sim/tmp140912/140911_DSPS_RUN002_INTERNAL'
#outdir <- '/Users/Oliver/git/HPTN071sim/tmp140912/140911_DSPS_RUN002_INTERNAL'
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201502/Regional_unblinded'
outdir <- '~/Dropbox (SPH Imperial College)/2015_PANGEA_MCE_manuscript/figures'
infiles <- list.files(indir, '.*INTERNAL.R$', full.names=FALSE)
for(i in seq_along(infiles))
{
i<-19#i<- 8 #i<- 4
infile <- infiles[i]
# load simulated data
file <- paste(indir, '/', infile, sep='')
cat(paste('\nLoading file', file))
load(file) #expect "df.epi" "df.trms" "df.inds" "df.sample" "df.seq"
# load outgroup sequences
file <- system.file(package="PANGEA.HIV.sim", "misc",'PANGEA_SSAfg_HXB2outgroup.R')
cat(paste('\nLoading outgroup seq from file', file))
load(file) #expect "outgroup.seq.gag" "outgroup.seq.pol" "outgroup.seq.env"
tmp <- tolower(do.call('rbind',strsplit(df.seq[, GAG],'')))
rownames(tmp) <- df.seq[, LABEL]
df.seq.gag <- as.DNAbin(tmp)
tmp <- tolower(do.call('rbind',strsplit(df.seq[, POL],'')))
rownames(tmp) <- df.seq[, LABEL]
df.seq.pol <- as.DNAbin(tmp)
tmp <- tolower(do.call('rbind',strsplit(df.seq[, ENV],'')))
rownames(tmp) <- df.seq[, LABEL]
df.seq.env <- as.DNAbin(tmp)
#
# get R2 for df.seq.pol
#
seq <- df.seq.pol
seq <- rbind(seq, outgroup.seq.pol[, seq_len(ncol(seq))])
# get NJ tree
tmp <- dist.dna(seq, model='raw')
nj <- nj(tmp)
tmp <- which(nj$tip.label=="HXB2")
nj <- reroot(nj, tmp, nj$edge.length[which(nj$edge[,2]==tmp)])
nj <- ladderize(nj)
if(0)
{
file <- paste( outdir, '/', substr(infile,1,nchar(infile)-20),'INFO_simu_NJpol.pdf', sep='' )
pdf(file=file, w=10, h=150)
plot(nj, show.tip=TRUE, cex=0.5)
add.scale.bar()
dev.off()
}
# get root to tip divergence on pol
nj <- drop.tip(nj,'HXB2')
tmp <- node.depth.edgelength(nj)
nj.info <- data.table(LABEL=nj$tip.label, ROOT2TIP=tmp[seq_len(Ntip(nj))] )
set(nj.info, NULL, 'CALENDAR_TIME', nj.info[, as.numeric(sapply(strsplit(LABEL, tree.id.labelsep, fixed=TRUE),'[[',tree.id.label.idx.ctime))] )
tmp <- lm(ROOT2TIP~CALENDAR_TIME, data=nj.info)
set( nj.info, NULL, 'ROOT2TIP_LM', predict(tmp, type='response') )
tmp2 <- c( R2=round(summary(tmp)$r.squared,d=3), SLOPE= as.numeric(round(coef(tmp)['CALENDAR_TIME'],d=4)), TMRCA=as.numeric(round( -coef(tmp)['(Intercept)']/coef(tmp)['CALENDAR_TIME'], d=1 )) )
ggplot(nj.info, aes(x=CALENDAR_TIME, y=ROOT2TIP)) + geom_point(alpha=0.5, colour='Orange') + geom_line(aes(y=ROOT2TIP_LM)) +
#scale_x_continuous(breaks=seq(1980,2020,2)) +
labs(x='\nSequence sampling date\n(Year)', y='Genetic distance of pol sequences from root\n(subst/site)\n') +
annotate("text", x=nj.info[, min(CALENDAR_TIME)], y=nj.info[, 0.9*max(ROOT2TIP)], label=paste("R2=", tmp2['R2'],'\nSlope=',tmp2['SLOPE'],'\nTMRCA=',tmp2['TMRCA'], sep=''), hjust = 0, size = 4) +
theme(legend.position=c(0,1), legend.justification=c(0,1))
file <- paste( outdir, '/R2_', substr(infile,1,nchar(infile)-20),'_NJpol.pdf', sep='' )
ggsave(file=file, w=10, h=5)
#
seq <- df.seq.env
seq <- rbind(seq, outgroup.seq.env[, seq_len(ncol(seq))])
# get NJ tree
tmp <- dist.dna(seq, model='raw')
nj <- nj(tmp)
tmp <- which(nj$tip.label=="HXB2")
nj <- reroot(nj, tmp, nj$edge.length[which(nj$edge[,2]==tmp)])
nj <- ladderize(nj)
# get root to tip divergence on pol
nj <- drop.tip(nj,'HXB2')
tmp <- node.depth.edgelength(nj)
nj.info <- data.table(LABEL=nj$tip.label, ROOT2TIP=tmp[seq_len(Ntip(nj))] )
set(nj.info, NULL, 'CALENDAR_TIME', nj.info[, as.numeric(sapply(strsplit(LABEL, tree.id.labelsep, fixed=TRUE),'[[',tree.id.label.idx.ctime))] )
tmp <- lm(ROOT2TIP~CALENDAR_TIME, data=nj.info)
set( nj.info, NULL, 'ROOT2TIP_LM', predict(tmp, type='response') )
tmp2 <- c( R2=round(summary(tmp)$r.squared,d=3), SLOPE= as.numeric(round(coef(tmp)['CALENDAR_TIME'],d=4)), TMRCA=as.numeric(round( -coef(tmp)['(Intercept)']/coef(tmp)['CALENDAR_TIME'], d=1 )) )
ggplot(nj.info, aes(x=CALENDAR_TIME, y=ROOT2TIP)) + geom_point(alpha=0.5, colour='Orange') + geom_line(aes(y=ROOT2TIP_LM)) +
#scale_x_continuous(breaks=seq(1980,2020,2)) +
labs(x='\nSequence sampling date\n(Year)', y='Genetic distance of env sequences from root\n(subst/site)\n') +
annotate("text", x=nj.info[, min(CALENDAR_TIME)], y=nj.info[, 0.9*max(ROOT2TIP)], label=paste("R2=", tmp2['R2'],'\nSlope=',tmp2['SLOPE'],'\nTMRCA=',tmp2['TMRCA'], sep=''), hjust = 0, size = 4) +
theme(legend.position=c(0,1), legend.justification=c(0,1))
file <- paste( outdir, '/R2_', substr(infile,1,nchar(infile)-20),'_NJenv.pdf', sep='' )
ggsave(file=file, w=10, h=5)
}
#
# get R2 for concatenated genome
#
seq <- cbind(df.seq.gag,df.seq.pol,df.seq.env)
tmp <- cbind(outgroup.seq.gag[,1:ncol(df.seq.gag)], outgroup.seq.pol, outgroup.seq.env)
seq <- rbind(seq,tmp)
# get NJ tree
tmp <- dist.dna(seq, model='raw')
nj <- nj(tmp)
tmp <- which(nj$tip.label=="HXB2")
nj <- reroot(nj, tmp, nj$edge.length[which(nj$edge[,2]==tmp)])
nj <- ladderize(nj)
file <- paste( outdir, '/', substr(infile,1,nchar(infile)-20),'INFO_simu_NJ.pdf', sep='' )
pdf(file=file, w=10, h=150)
plot(nj, show.tip=TRUE, cex=0.5)
add.scale.bar()
dev.off()
# get root to tip divergence
nj <- drop.tip(nj,'HXB2')
tmp <- node.depth.edgelength(nj)
nj.info <- data.table(LABEL=nj$tip.label, ROOT2TIP=tmp[seq_len(Ntip(nj))] )
set(nj.info, NULL, 'CALENDAR_TIME', nj.info[, as.numeric(sapply(strsplit(LABEL, tree.id.labelsep, fixed=TRUE),'[[',tree.id.label.idx.ctime))] )
tmp <- lm(ROOT2TIP~CALENDAR_TIME, data=nj.info)
set( nj.info, NULL, 'ROOT2TIP_LM', predict(tmp, type='response') )
tmp2 <- c( R2=round(summary(tmp)$r.squared,d=3), SLOPE= as.numeric(round(coef(tmp)['CALENDAR_TIME'],d=4)), TMRCA=as.numeric(round( -coef(tmp)['(Intercept)']/coef(tmp)['CALENDAR_TIME'], d=1 )) )
ggplot(nj.info, aes(x=CALENDAR_TIME, y=ROOT2TIP)) + geom_point(alpha=0.5) + geom_line(aes(y=ROOT2TIP_LM)) +
#scale_x_continuous(breaks=seq(1980,2020,2)) +
labs(x='Sequence sampling date', y='root-to-tip divergence\n(HIV-1 concatenated sequences)') +
annotate("text", x=nj.info[, min(CALENDAR_TIME)], y=nj.info[, 0.9*max(ROOT2TIP)], label=paste("R2=", tmp2['R2'],'\nSlope=',tmp2['SLOPE'],'\nTMRCA=',tmp2['TMRCA'], sep=''), hjust = 0, size = 4) +
theme(legend.position=c(0,1), legend.justification=c(0,1))
file <- paste( outdir, '/', substr(infile,1,nchar(infile)-20),'INFO_simu_NJR2.pdf', sep='' )
ggsave(file=file, w=10, h=6)
}
##--------------------------------------------------------------------------------------------------------
## check simulated sequences: create ExaML tree and estimate R2
## olli 18.09.14
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.scenarios<- function()
{
if(0)
{
indir <- system.file(package="PANGEA.HIV.sim", "misc")
infile.ind <- '170914_HPTN071_scA_rep1'
infile.trm <- '170914_HPTN071_scA_rep1'
label <- ''
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.2,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.1,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.006716145)-0.37^2/2, wher.sigma=0.37,
bwerm.mu=log(0.002239075)-0.05^2/2, bwerm.sigma=0.05)
tmpdir <- '/Users/Oliver/git/HPTN071sim/HscABase140918'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
}
##--------------------------------------------------------------------------------------------------------
## rename simulated sequences
## olli 27.10.14
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.November2014.renameVillage<- function()
{
indir <- '/Users/Oliver/duke/2014_Gates/meeting_2014_09_DSPSupdate/CD4-toBlind'
infiles <- data.table(FILE= list.files(path=indir))
set(infiles, NULL, 'EPI', infiles[, sapply(strsplit(FILE, '_'),'[[',1)])
set(infiles, NULL, 'EPI', infiles[, factor(EPI, levels=c('A','B','C'), labels=c(1,2,3))])
set(infiles, NULL, 'TIMEPOINT', infiles[, sapply(strsplit(FILE, '_'),'[[',3)])
set(infiles, NULL, 'TIMEPOINT', infiles[, substr(TIMEPOINT,1,1)])
set(infiles, NULL, 'END', infiles[, sapply(strsplit(FILE, '.', fixed=TRUE),'[[',2)])
tmp <- unique(subset(infiles, select=c(EPI, TIMEPOINT)))
set(tmp, NULL, 'TIMEPOINT_BLINDED', c('B','A','C', 'D','F','E', 'I','H','G'))
infiles <- merge(infiles, tmp, by=c('EPI','TIMEPOINT'))
set(infiles, NULL, 'SAMPLE', 1)
set(infiles, infiles[, which(EPI==3 & grepl('60', FILE))], 'SAMPLE', 2)
set(infiles, NULL, 'TO', infiles[, paste('281014','_Village_sc',TIMEPOINT_BLINDED,'_sample',SAMPLE,'_epi',EPI,'.',END,sep='')])
infiles[, file.rename(paste(indir,'/',FILE,sep=''), paste(indir,'/',TO,sep='')), by='FILE']
}
##--------------------------------------------------------------------------------------------------------
## rename simulated sequences
## olli 26.10.14
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.November2014.rename<- function()
{
indir <- '/Users/Oliver/git/Pangea__Oct27_2014_code'
indir <- '/Users/Oliver/git/Pangea__Oct27_2014_code'
infiles <- data.table(FILE= list.files(path=indir, pattern='*SCENARIO.*.csv'))
set(infiles, NULL, 'DATE', infiles[, substr(FILE, 1, 6)])
set(infiles, NULL, 'END', infiles[, substr(FILE, nchar(FILE)-6, nchar(FILE))])
set(infiles, NULL, 'REP', infiles[, regmatches(FILE,regexpr('RUN[0-9]+',FILE))])
set(infiles, NULL, 'REP', infiles[, as.numeric(substr(REP, 4, nchar(REP)))])
set(infiles, NULL, 'SCENARIO', infiles[, regmatches(FILE,regexpr('SCENARIO_[0-3]',FILE))])
set(infiles, NULL, 'SCENARIO', infiles[, substr(SCENARIO, nchar(SCENARIO), nchar(SCENARIO))])
set(infiles, infiles[, which(SCENARIO=='0')], 'SCENARIO', 'A')
set(infiles, infiles[, which(SCENARIO=='3')], 'SCENARIO', 'B')
set(infiles, infiles[, which(SCENARIO=='2')], 'SCENARIO', 'C')
set(infiles, NULL, 'TO', infiles[, paste(DATE,'_HPTN071_sc',SCENARIO,'_rep',REP,'_',END,sep='')])
infiles[, file.rename(paste(indir,'/',FILE,sep=''), paste(indir,'/',TO,sep='')), by='FILE']
}
##--------------------------------------------------------------------------------------------------------
## check simulated sequences: create ExaML tree and estimate R2
## olli 03.11.14
##--------------------------------------------------------------------------------------------------------
project.PANGEA.logo<- function()
{
treelabel.idx.sep <- '|'
infile <- "/Users/Oliver/Dropbox\ (Infectious Disease)/PANGEAHIVsim_internal/freeze_Nov14/freeze1028_HPTN071_seqsimA/281014_HPTN071_scA_rep1_INTERNAL/281014_HPTN071_scA_rep1_SIMULATED_INTERNAL.R"
load(infile)
# concatenate sequences
tmp <- tolower(do.call('rbind',strsplit(df.seq[, paste(GAG,POL,ENV,sep='')],'')))
rownames(tmp) <- df.seq[, paste(IDCLU,treelabel.idx.sep,LABEL,sep='')]
seq <- as.DNAbin(tmp)
#tmp <- cbind(outgroup.seq.gag[,1:ncol(df.seq.gag)], outgroup.seq.pol, outgroup.seq.env)
#seq <- rbind(seq,tmp)
seq.ph <- nj(dist.dna(seq, model='raw'))
seq.ph <- ladderize(seq.ph)
# plot
file <- paste('/Users/Oliver/duke/2014_Gates', '/', 'logo_ph.pdf', sep='')
pdf(file=file, w=10, h=10)
plot(seq.ph, type='fan',show.tip.label=0)
dev.off()
}
##--------------------------------------------------------------------------------------------------------
## check simulated sequences: create ExaML tree and estimate R2
## olli 23.10.14
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.November2014.plotrepclicates<- function()
{
indir <- '/Users/Oliver/git/HPTN071sim/freeze1028'
infiles <- list.files(path=indir, pattern='*INTERNAL.R', recursive=TRUE)
infiles <- data.table(FILE=infiles[grepl('281014',infiles)])
infiles[, SCENARIO:=regmatches(FILE,regexpr('sc[A-C]',FILE))]
infiles[, REP:=regmatches(FILE,regexpr('rep[0-9]+',FILE))]
set(infiles, NULL, 'SCENARIO', infiles[, substr(SCENARIO, 3, 3)])
set(infiles, NULL, 'REP', infiles[, as.numeric(substr(REP, 4, nchar(REP)))])
set(infiles, NULL, 'REP2', 1:3)
setkey(infiles, SCENARIO, REP)
df.epi <- infiles[, {
z<- load( paste(indir, '/', FILE, sep='') )
df.epi[, SCENARIO:=SCENARIO]
df.epi[, REP2:=REP2]
df.epi
}, by=c('SCENARIO','REP')]
ggplot( df.epi, aes(x=YR, y=INC, group=interaction(SCENARIO,REP), colour=SCENARIO) ) + geom_line() + theme_bw() +
scale_x_continuous(limits=c(1980,2020), breaks=seq(1980,2020,2)) +
scale_y_continuous(breaks=seq(0,2000,100))
ggplot( df.epi, aes(x=YR, y=INCp*100, group=interaction(SCENARIO,REP), colour=interaction(SCENARIO,REP)) ) + geom_line(size=1.5) + theme_bw() +
#scale_fill_manual (values=colours, guide=FALSE) +
scale_colour_hue(c=50, l=60, guide=FALSE) +
#scale_colour_brewer(palette='Accent', guide=FALSE) +
scale_x_continuous(limits=c(1980,2020), breaks=seq(1980,2020,10), expand=c(0,0)) +
scale_y_continuous(breaks=seq(0,5,0.5)) +
labs(x='', y='% Incidence') +
theme(panel.grid.major=element_line(colour="grey30", size=0.2))
file<- paste(indir,'/inc_scenarios.pdf',sep='')
ggsave(file=file, width=6, height=4)
}
##--------------------------------------------------------------------------------------------------------
## check simulated sequences: create ExaML tree and estimate R2
## olli 26.01.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Feb2015.dev<- function()
{
if(0)
{
#check simu tree
file <- '~/git/HPTN071sim/tmp151027-o0PU/150127_RUN001-o0PU_SIMULATED_DATEDTREE/150127_RUN001-o0PU_DATEDTREE.newick'
ph <- read.tree(file)
df <- data.table( LABEL=ph$tip.label,
TIME_SEQ= as.numeric(sapply( strsplit(ph$tip.label, '|' , fixed=TRUE), '[[', 4 )),
TIME2ROOT=node.depth.edgelength(ph)[seq_len(Ntip(ph))] )
ggplot(df, aes(x=TIME_SEQ, y=TIME2ROOT)) + geom_point()
#straight line as expected
}
if(0)
{
indir <- '/Users/Oliver/Dropbox\ (Infectious Disease)/PANGEAHIVsim_internal/freeze_Jan15/regional/150125'
infile <- 'Annual_outputs_CL19_SA_A_V0_Rand1_Run48.csv'
file <- paste(indir,infile,sep='/')
df <- as.data.table(read.csv(file))
set(df, NULL, 'X', NULL)
tmp <- names(df)[ which(as.character(sapply(df, class))!='numeric') ]
for(x in tmp)
set(df, NULL, x, as.numeric(df[[x]]))
df[, 'Percent\nINC':= df[, NewCasesThisYear/(TotalPopulation-Prevalence)]]
df <- melt(df, id.vars=c('Year'))
set(df, NULL, 'variable', df[, gsub('_','\n',variable)])
set(df, NULL, 'variable', df[, gsub('MeanN','MeanN\n',variable)])
set(df, NULL, 'variable', df[, gsub('NewCases','NewCases\n',variable)])
set(df, NULL, 'variable', df[, gsub('NHIVTested','NHIVTested\n',variable)])
set(df, NULL, 'variable', df[, gsub('Cumulative','Cumulative\n',variable)])
set(df, NULL, 'variable', df[, gsub('Number','Number\n',variable)])
set(df, NULL, 'variable', df[, gsub('PropHIV','PropHIV\n',variable)])
set(df, NULL, 'variable', df[, gsub('PropAnnual','PropAnnual\n',variable)])
# paste(df[, unique(variable)], collapse='\',\'')
tmp <- c('Prevalence','Number\nPositive','NewCases\nThisYear','Percent\nINC','PropAnnual\nAcute','PropHIV\nPosONART','NAnnual','TotalPopulation','Number\nPositiveM','PopulationM','Number\nPositiveF','PopulationF','Cumulative\nNonPopartHIVtests','Cumulative\nPopartHIVtests','Cumulative\nNonPopartCD4tests','Cumulative\nPopartCD4tests','NHIVTested\nThisYear','NOnARTM','NNeedARTM','NOnARTF','NNeedARTF','PropMenCirc','Prop\nriskLow','Prop\nriskMed','Prop\nriskHigh','Prevalence\nriskLow','Prevalence\nriskMed','Prevalence\nriskHigh','MeanN\ncurrentpart\nriskLow','MeanN\ncurrentpart\nriskMed','MeanN\ncurrentpart\nriskHigh','MeanN\ncurrentsdpart\nriskLow','MeanN\ncurrentsdpart\nriskMed','MeanN\ncurrentsdpart\nriskHigh','MeanN\newPartnersthisyear\nriskLow','MeanN\newPartnersthisyear\nriskMed','MeanN\newPartnersthisyear\nriskHigh','MeanN\nlifetimepart\nriskLow','MeanN\nlifetimepart\nriskMed','MeanN\nlifetimepart\nriskHigh')
set(df, NULL, 'variable', df[, factor(variable, levels=tmp, labels=tmp)])
ggplot(df, aes(x=Year, y=value, group=variable)) + geom_step(with.guide=FALSE) +
facet_grid(variable~., scales='free_y') +
scale_x_continuous(breaks=seq(1950,2050,by=10)) + theme_bw() +
theme(strip.text=element_text(size=7))
file <- paste(indir,gsub('\\.csv','\\.pdf',infile),sep='/')
ggsave(file=file, w=8, h=40)
}
#
# new epi model; new Sampling model based on diagnoses; prelim scenario Acute Low + ART Fast
#
if(0)
{
indir <- '/Users/Oliver/git/HPTN071sim/source/PANGEA.HIV.sim/inst/misc'
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42, s.MODEL='Prop2DiagB4I',
s.PREV.max=0.08, s.INTERVENTION.start=2015, s.INTERVENTION.mul= 2, s.ARCHIVAL.n=50,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.05,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.003358613)-0.3^2/2, wher.sigma=0.3, bwerm.mu=log(0.002239075)-0.13^2/2, bwerm.sigma=0.13, er.gamma=NA,
sp.prop.of.sexactive= 0.05, report.prop.recent=1.0,
dbg.GTRparam=0, dbg.rER=0, index.starttime.mode='fix1955', startseq.mode='one', seqtime.mode='AtDiag')
# proposed standard run and control simulation
pipeline.vary <- data.table( seqtime.mode= c('AtDiag','AtDiag','AtDiag','AtDiag','AtDiag','AtTrm'),
s.MODEL= c('Prop2DiagB4I','Prop2DiagB4I','Prop2DiagB4I','Prop2DiagB4I','Prop2DiagB4I','Prop2Untreated'),
startseq.mode= c('one','one', 'one','many','one','one'),
epi.import= c(0.05, 0.05, 0.05, 0.05, 0, 0),
wher.mu= c(log(0.003358613), log(0.003358613)-0.3^2/2, log(0.003358613)-0.3^2/2, log(0.003358613)-0.3^2/2, log(0.002239075)-0.13^2/2, log(0.002239075)-0.13^2/2),
wher.sigma= c(0, 0.3, 0.3, 0.3,0.13,0.13),
bwerm.mu= c(log(0.002239075), log(0.002239075)-0.13^2/2, log(0.002239075)-0.13^2/2, log(0.002239075)-0.13^2/2, log(0.002239075)-0.13^2/2, log(0.002239075)-0.13^2/2),
bwerm.sigma= c(0, 0.13, 0.13, 0.13, 0.13, 0.13),
dbg.GTRparam= c(1, 1, 0, 0, 0, 0),
dbg.rER= c(1, 1, 0, 0, 0, 0),
label= c('-o5111DI', '-o5011DI', '-o5DI','-m5DI','-o0DI','-o0PU'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='seqtime.mode' ), 'v', as.character(seqtime.mode))
set(pipeline.args, which( pipeline.args$stat=='s.MODEL' ), 'v', as.character(s.MODEL))
set(pipeline.args, which( pipeline.args$stat=='startseq.mode' ), 'v', as.character(startseq.mode))
set(pipeline.args, which( pipeline.args$stat=='epi.import' ), 'v', as.character(epi.import))
set(pipeline.args, which( pipeline.args$stat=='wher.mu' ), 'v', as.character(wher.mu))
set(pipeline.args, which( pipeline.args$stat=='wher.sigma' ), 'v', as.character(wher.sigma))
set(pipeline.args, which( pipeline.args$stat=='bwerm.mu' ), 'v', as.character(bwerm.mu))
set(pipeline.args, which( pipeline.args$stat=='bwerm.sigma' ), 'v', as.character(bwerm.sigma))
set(pipeline.args, which( pipeline.args$stat=='dbg.GTRparam' ), 'v', as.character(dbg.GTRparam))
set(pipeline.args, which( pipeline.args$stat=='dbg.rER' ), 'v', as.character(dbg.rER))
print(pipeline.args)
# scenario Acute Low + ART Fast
infile.ind <- '150127_RUN001'
infile.trm <- '150127_RUN001'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp151030'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind, label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm, label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
}
#
# check control: why so much variation in root to tip divergence?
#
if(0)
{
indir <- '/Users/Oliver/git/HPTN071sim/source/PANGEA.HIV.sim/inst/misc'
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42, s.MODEL='Prop2DiagB4I',
s.PREV.max=0.08, s.INTERVENTION.start=2015, s.INTERVENTION.mul= 2, s.ARCHIVAL.n=50,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.05,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.003358613)-0.3^2/2, wher.sigma=0.3, bwerm.mu=log(0.002239075)-0.13^2/2, bwerm.sigma=0.13, er.gamma=NA,
dbg.GTRparam=0, dbg.rER=0, index.starttime.mode='fix1955', startseq.mode='one', seqtime.mode='AtDiag')
# proposed standard run and control simulation
pipeline.vary <- data.table( seqtime.mode= c('AtTrm','AtTrm','AtTrm','AtTrm'),
s.MODEL= c('Prop2Untreated','Prop2Untreated','Prop2Untreated','Prop2Untreated'),
startseq.mode= c('one','one','one','one'),
epi.import= c(0, 0, 0, 0),
wher.mu= c(log(0.002239075)-0.13^2/2, log(0.002239075)-0.13^2/2, log(0.002239075)-0.13^2/2, log(0.002239075)),
wher.sigma= c(0.13, 0.13, 0.13, 0),
dbg.GTRparam= c(0, 1, 1, 1),
dbg.rER= c(0, 0, 1, 1),
label= c('-o0PU000','-o0PU010', '-o0PU011', '-o0PU111'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='seqtime.mode' ), 'v', as.character(seqtime.mode))
set(pipeline.args, which( pipeline.args$stat=='s.MODEL' ), 'v', as.character(s.MODEL))
set(pipeline.args, which( pipeline.args$stat=='startseq.mode' ), 'v', as.character(startseq.mode))
set(pipeline.args, which( pipeline.args$stat=='epi.import' ), 'v', as.character(epi.import))
set(pipeline.args, which( pipeline.args$stat=='wher.mu' ), 'v', as.character(wher.mu))
set(pipeline.args, which( pipeline.args$stat=='wher.sigma' ), 'v', as.character(wher.sigma))
set(pipeline.args, which( pipeline.args$stat=='dbg.GTRparam' ), 'v', as.character(dbg.GTRparam))
set(pipeline.args, which( pipeline.args$stat=='dbg.rER' ), 'v', as.character(dbg.rER))
print(pipeline.args)
# scenario Acute Low + ART Fast
infile.ind <- '150127_RUN001'
infile.trm <- '150127_RUN001'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp150130'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind, label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm, label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}, by='label']
}
#
# after bugfixes, R2 is 66%! Need to add more variability ..
#
if(1)
{
indir <- '/Users/Oliver/git/HPTN071sim/source/PANGEA.HIV.sim/inst/misc'
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42, s.MODEL='Prop2DiagB4I',
s.PREV.max=0.08, s.INTERVENTION.start=2015, s.INTERVENTION.mul= 2, s.ARCHIVAL.n=50,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.05,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.003358613)-0.3^2/2, wher.sigma=0.3, bwerm.mu=log(0.002239075)-0.13^2/2, bwerm.sigma=0.13, er.gamma=NA,
dbg.GTRparam=0, dbg.rER=0, index.starttime.mode='fix1955', startseq.mode='one', seqtime.mode='AtDiag')
# proposed standard run and control simulation
pipeline.vary <- data.table( seqtime.mode= c('AtDiag', 'AtDiag','AtDiag','AtDiag','Exp3','Exp3'),
s.MODEL= c('Prop2DiagB4I','Prop2DiagB4I','Prop2DiagB4I','Prop2DiagB4I','Prop2DiagB4I','Prop2DiagB4I'),
wher.mu= c(log(0.00447743)-0.3^2/2, log(0.00447743)-0.5^2/2, log(0.00447743)-0.3^2/2, log(0.00447743)-0.5^2/2, log(0.002239075)-0.13^2/2, log(0.002239075)-0.13^2/2),
wher.sigma= c(0.3, 0.5, 0.3, 0.5, 0.3, 0.5),
er.gamma= c(NA, NA, 4, 4, 4, 4),
label= c('-o30DI', '-o50DI', '-o34DI','-o54DI','-o34E3','-o54E3'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='seqtime.mode' ), 'v', as.character(seqtime.mode))
set(pipeline.args, which( pipeline.args$stat=='s.MODEL' ), 'v', as.character(s.MODEL))
set(pipeline.args, which( pipeline.args$stat=='wher.mu' ), 'v', as.character(wher.mu))
set(pipeline.args, which( pipeline.args$stat=='wher.sigma' ), 'v', as.character(wher.sigma))
set(pipeline.args, which( pipeline.args$stat=='er.gamma' ), 'v', as.character(er.gamma))
print(pipeline.args)
# scenario Acute Low + ART Fast
infile.ind <- '150127_RUN001'
infile.trm <- '150127_RUN001'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp150130b'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind, label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm, label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}, by='label']
}
#
# -o54DI brings R2 down to 51%, need more
#
if(1)
{
indir <- '/Users/Oliver/git/HPTN071sim/source/PANGEA.HIV.sim/inst/misc'
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42, s.MODEL='Prop2DiagB4I',
s.PREV.max=0.08, s.INTERVENTION.start=2015, s.INTERVENTION.mul= 2, s.ARCHIVAL.n=50,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.05,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.003358613)-0.3^2/2, wher.sigma=0.3, bwerm.mu=log(0.002239075)-0.13^2/2, bwerm.sigma=0.13, er.gamma=NA,
dbg.GTRparam=0, dbg.rER=0, index.starttime.mode='fix1955', startseq.mode='one', seqtime.mode='AtDiag')
# proposed standard run and control simulation
pipeline.vary <- data.table( seqtime.mode= c('AtDiag','DUnif5', 'DUnif9','DUnif5','DUnif9','DUnif5','DUnif9'),
s.MODEL= c('Prop2DiagB4I','Prop2DiagB4I','Prop2DiagB4I','Prop2DiagB4I','Prop2DiagB4I','Prop2DiagB4I','Prop2DiagB4I'),
wher.mu= c(log(0.00447743)-0.5^2/2, log(0.00447743)-0.5^2/2, log(0.00447743)-0.5^2/2, log(0.00447743)-0.5^2/2, log(0.00447743)-0.5^2/2, log(0.00447743)-0.5^2/2, log(0.00447743)-0.5^2/2),
wher.sigma= c(0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5),
er.gamma= c(4, 4, 4, 4, 4, 4, 4),
index.starttime.mode= c('fix1955','fix1955','fix1955','fix1955','fix1955','normal','normal'),
startseq.mode= c('one','one','one','many','many','many','many'),
label= c('-o54DI', '-o54D5', '-o54D9', '-m54D5', '-m54D9', '-f54D5', '-f54D9'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='seqtime.mode' ), 'v', as.character(seqtime.mode))
set(pipeline.args, which( pipeline.args$stat=='s.MODEL' ), 'v', as.character(s.MODEL))
set(pipeline.args, which( pipeline.args$stat=='wher.mu' ), 'v', as.character(wher.mu))
set(pipeline.args, which( pipeline.args$stat=='wher.sigma' ), 'v', as.character(wher.sigma))
set(pipeline.args, which( pipeline.args$stat=='er.gamma' ), 'v', as.character(er.gamma))
set(pipeline.args, which( pipeline.args$stat=='index.starttime.mode' ), 'v', as.character(index.starttime.mode))
set(pipeline.args, which( pipeline.args$stat=='startseq.mode' ), 'v', as.character(startseq.mode))
print(pipeline.args)
# scenario Acute Low + ART Fast
infile.ind <- '150127_RUN001'
infile.trm <- '150127_RUN001'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp150131b'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind, label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm, label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}, by='label']
}
#
# -m54D5 brings R2 down to 42%. not too bad. try and get div overall a bit higher
#
if(0)
{
indir <- '/Users/Oliver/git/HPTN071sim/source/PANGEA.HIV.sim/inst/misc'
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42, s.MODEL='Prop2DiagB4I',
s.PREV.max=0.08, s.INTERVENTION.start=2015, s.INTERVENTION.mul= 2, s.ARCHIVAL.n=50,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.05,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.00447743)-0.5^2/2, wher.sigma=0.5, bwerm.mu=log(0.002239075)-0.13^2/2, bwerm.sigma=0.13, er.gamma=4,
dbg.GTRparam=0, dbg.rER=0, index.starttime.mode='fix1955', startseq.mode='many', seqtime.mode='AtDiag')
# proposed standard run and control simulation
pipeline.vary <- data.table( seqtime.mode= c('DUnif5','DUnif5','DUnif5','DUnif5','DUnif5','DUnif5','AtTrm'),
s.MODEL= c('Prop2DiagB4I','Prop2DiagB4I','Prop2DiagB4I','Prop2DiagB4I','Prop2DiagB4I','Prop2DiagB4I','Prop2Untreated'),
index.starttime.mode= c('fix1955','fix1955','fix1955','fix1955','normal','normal','fix1955'),
bwerm.mu= c(log(0.002239075)-0.16^2/2, log(0.002239075)-0.2^2/2, log(0.002239075)-0.25^2/2, log(0.002239075)-0.3^2/2, log(0.002239075)-0.2^2/2, log(0.002239075)-0.3^2/2,log(0.002239075)-0.16^2/2),
bwerm.sigma= c(0.16, 0.2, 0.25, 0.3, 0.2, 0.3, 0.16),
startseq.mode= c('many','many','many','many','many','many','one'),
label= c('-m16', '-m20', '-m25', '-m30', '-n20', '-n30', '-o16'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='seqtime.mode' ), 'v', as.character(seqtime.mode))
set(pipeline.args, which( pipeline.args$stat=='s.MODEL' ), 'v', as.character(s.MODEL))
set(pipeline.args, which( pipeline.args$stat=='bwerm.mu' ), 'v', as.character(bwerm.mu))
set(pipeline.args, which( pipeline.args$stat=='bwerm.sigma' ), 'v', as.character(bwerm.sigma))
set(pipeline.args, which( pipeline.args$stat=='index.starttime.mode' ), 'v', as.character(index.starttime.mode))
set(pipeline.args, which( pipeline.args$stat=='startseq.mode' ), 'v', as.character(startseq.mode))
print(pipeline.args)
# scenario Acute Low + ART Fast
infile.ind <- '150127_RUN001'
infile.trm <- '150127_RUN001'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp150201'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind, label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm, label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}, by='label']
}
#
# -m30 brings R2 down to 32%. that s seem OK to use.
#
if(0)
{
indir <- '/Users/Oliver/git/HPTN071sim/source/PANGEA.HIV.sim/inst/misc'
pipeline.args <- sim.regional.args( yr.start=1985, yr.end=2020, seed=42, s.MODEL='Prop2DiagB4I',
s.PREV.max=0.08, s.INTERVENTION.start=2015, s.INTERVENTION.mul= 2, s.ARCHIVAL.n=50,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.05,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.00447743)-0.5^2/2, wher.sigma=0.5, bwerm.mu=log(0.002239075)-0.3^2/2, bwerm.sigma=0.3, er.gamma=4,
dbg.GTRparam=0, dbg.rER=0, index.starttime.mode='fix1955', startseq.mode='many', seqtime.mode='DUnif5')
# proposed standard run and control simulation
pipeline.vary <- data.table( seqtime.mode= c('DUnif5','AtDiag','AtDiag','AtDiag','AtTrm'),
s.MODEL= c('Prop2DiagB4I','Prop2DiagB4I','Prop2DiagB4I','Prop2DiagB4I','Prop2Untreated'),
startseq.mode= c('many','many','many','many','many'),
s.multi= c(1,1,2,4,1),
label= c('-mD5', '-mDI','-mDIs2x','-mDIs4x', '-mPU'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='seqtime.mode' ), 'v', as.character(seqtime.mode))
set(pipeline.args, which( pipeline.args$stat=='s.MODEL' ), 'v', as.character(s.MODEL))
set(pipeline.args, which( pipeline.args$stat=='startseq.mode' ), 'v', as.character(startseq.mode))
print(pipeline.args)
if(1)
{
# scenario A
infile.ind <- '150129_HPTN071_scA'
infile.trm <- '150129_HPTN071_scA'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp150203-A'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max' ), 'v', as.character(s.multi*0.073))
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
if(1)
{
# scenario B
infile.ind <- '150129_HPTN071_scB'
infile.trm <- '150129_HPTN071_scB'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp150203-B'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max' ), 'v', as.character(s.multi*0.084))
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
if(1)
{
# scenario C
infile.ind <- '150129_HPTN071_scC'
infile.trm <- '150129_HPTN071_scC'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp150203-C'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max' ), 'v', as.character(s.multi*0.075))
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
if(1)
{
# scenario D
infile.ind <- '150129_HPTN071_scD'
infile.trm <- '150129_HPTN071_scD'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp150203-D'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max' ), 'v', as.character(s.multi*0.08))
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
if(1)
{
# scenario E
infile.ind <- '150129_HPTN071_scE'
infile.trm <- '150129_HPTN071_scE'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp150203-E'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max' ), 'v', as.character(s.multi*0.078))
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
if(1)
{
# scenario F
infile.ind <- '150129_HPTN071_scF'
infile.trm <- '150129_HPTN071_scF'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp150203-F'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max' ), 'v', as.character(s.multi*0.083))
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
}, by='label']
}
#
# fixed seq sampling fractions etc
#
if(1)
{
indir <- '/Users/Oliver/git/HPTN071sim/source/PANGEA.HIV.sim/inst/misc'
pipeline.args <- sim.regional.args( yr.start=1985, yr.end=2020, seed=42, s.MODEL='Prop2DiagB4I', report.prop.recent=1.0,
s.PREV.max=NA, s.INTERVENTION.prop=NA, s.INTERVENTION.start=2015, s.INTERVENTION.mul= NA, s.ARCHIVAL.n=50,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.05,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.00447743)-0.5^2/2, wher.sigma=0.5, bwerm.mu=log(0.002239075)-0.3^2/2, bwerm.sigma=0.3, er.gamma=4,
dbg.GTRparam=0, dbg.rER=0, index.starttime.mode='fix1955', startseq.mode='many', seqtime.mode='AtDiag')
# proposed standard run and control simulation
pipeline.vary <- data.table( seqtime.mode= c('AtDiag','AtDiag','AtDiag','AtDiag','AtDiag','AtDiag','AtDiag','AtDiag','AtDiag'),
s.MODEL= c('Prop2DiagB4I','Prop2DiagB4I','Prop2Untreated','Fixed2Prop','Fixed2Prop','Fixed2Prop','Fixed2Prop','Fixed2Prop','Fixed2Prop'),
startseq.mode= c('many','many','many','many','many','many','many','many','many'),
s.multi= c(1,2,1,1,1,1,2,1,1),
yr.end= c(2020,2020,2020,2020,2020,2020,2020,2020,2018),
epi.import= c(0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.2,0.05),
s.PREV.max.n= c(1600,1600,1600,1600,1600,1600,1600,1600,1280),
s.INTERVENTION.prop= c(NA,NA,NA,0.85,0.5,0.25,0.5,0.5,0.375),
s.INTERVENTION.mul= c(1,1,1,NA,NA,NA,NA,NA,NA),
label= c('-mDI','-mDIs2x','-mPU','-mFP85','-mFP50','-mFP25','-mFP50s2x','-mFP50c20','-mFP50e18'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='seqtime.mode' ), 'v', as.character(seqtime.mode))
set(pipeline.args, which( pipeline.args$stat=='s.MODEL' ), 'v', as.character(s.MODEL))
set(pipeline.args, which( pipeline.args$stat=='startseq.mode' ), 'v', as.character(startseq.mode))
set(pipeline.args, which( pipeline.args$stat=='s.INTERVENTION.mul' ), 'v', as.character(s.INTERVENTION.mul))
set(pipeline.args, which( pipeline.args$stat=='s.INTERVENTION.prop' ), 'v', as.character(s.INTERVENTION.prop))
set(pipeline.args, which( pipeline.args$stat=='epi.import' ), 'v', as.character(epi.import))
print(pipeline.args)
if(1)
{
# scenario A
infile.ind <- '150129_HPTN071_scA'
infile.trm <- '150129_HPTN071_scA'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp150204-A'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max.n' ), 'v', as.character(s.multi*s.PREV.max.n))
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
if(1)
{
# scenario B
infile.ind <- '150129_HPTN071_scB'
infile.trm <- '150129_HPTN071_scB'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp150204-B'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max.n' ), 'v', as.character(s.multi*s.PREV.max.n))
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
if(1)
{
# scenario C
infile.ind <- '150129_HPTN071_scC'
infile.trm <- '150129_HPTN071_scC'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp150204-C'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max.n' ), 'v', as.character(s.multi*s.PREV.max.n))
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
if(1)
{
# scenario D
infile.ind <- '150129_HPTN071_scD'
infile.trm <- '150129_HPTN071_scD'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp150204-D'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max.n' ), 'v', as.character(s.multi*s.PREV.max.n))
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
if(1)
{
# scenario E
infile.ind <- '150129_HPTN071_scE'
infile.trm <- '150129_HPTN071_scE'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp150204-E'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max.n' ), 'v', as.character(s.multi*s.PREV.max.n))
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
if(1)
{
# scenario F
infile.ind <- '150129_HPTN071_scF'
infile.trm <- '150129_HPTN071_scF'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp150204-F'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max.n' ), 'v', as.character(s.multi*s.PREV.max.n))
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
}, by='label']
}
#
# double check R2 -- now very low!
#
if(1)
{
indir <- '/Users/Oliver/git/HPTN071sim/source/PANGEA.HIV.sim/inst/misc'
pipeline.args <- sim.regional.args( yr.start=1985, yr.end=2020, seed=42, s.MODEL='Fixed2Prop', report.prop.recent=1.0,
s.PREV.max.n=1600, s.INTERVENTION.prop=0.5, s.INTERVENTION.start=2015, s.INTERVENTION.mul= NA, s.ARCHIVAL.n=50,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.05, root.edge.fixed=0,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.00447743)-0.5^2/2, wher.sigma=0.5, bwerm.mu=log(0.002239075)-0.3^2/2, bwerm.sigma=0.3, er.gamma=4,
dbg.GTRparam=0, dbg.rER=0, index.starttime.mode='fix1955', startseq.mode='many', seqtime.mode='AtDiag')
# proposed standard run and control simulation
pipeline.vary <- data.table( wher.mu= c(log(0.00447743)-0.5^2/2,log(0.00447743)-0.5^2/2,log(0.00447743)-0.5^2/2,log(0.00447743)-0.3^2/2,log(0.00447743)-0.5^2/2,log(0.00447743)-0.5^2/2,log(0.00447743)-0.5^2/2,log(0.00447743)-0.3^2/2,log(0.002239075)-0.16^2/2,log(0.00447743)-0.5^2/2,log(0.00447743)-0.5^2/2,log(0.00447743)-0.5^2/2,log(0.00447743)-0.5^2/2),
wher.sigma= c(0.5,0.5,0.5,0.3,0.5,0.5,0.5,0.3, 0.16, 0.5, 0.5, 0.5, 0.5),
bwerm.mu= c(log(0.002239075)-0.3^2/2, log(0.002239075)-0.2^2/2, log(0.002239075)-0.16^2/2, log(0.002239075)-0.16^2/2,log(0.002239075)-0.3^2/2, log(0.002239075)-0.2^2/2, log(0.002239075)-0.16^2/2, log(0.002239075)-0.16^2/2, log(0.002239075)-0.16^2/2,log(0.002239075)-0.3^2/2, log(0.002239075)-0.2^2/2,log(0.002239075)-0.3^2/2, log(0.002239075)-0.2^2/2),
bwerm.sigma= c(0.3, 0.2, 0.16, 0.16,0.3, 0.2, 0.16, 0.16, 0.16, 0.3, 0.2, 0.3, 0.2),
startseq.mode= c('many','many','many','many','one','one','one','one','one','one','one','one','one'),
index.starttime.mode= c('fix1955','fix1955','fix1955','fix1955','fix1955','fix1955','fix1955','fix1955','fix1955','fix1965','fix1965','fix1972','fix1972'),
label= c('-bh30','-bh20','-bh16','-wh30','-obh30','-obh20','-obh16','-owh30','-owh16','-obh3065','-obh2065','-obh3072','-obh2072'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='wher.mu' ), 'v', as.character(wher.mu))
set(pipeline.args, which( pipeline.args$stat=='wher.sigma' ), 'v', as.character(wher.sigma))
set(pipeline.args, which( pipeline.args$stat=='bwerm.mu' ), 'v', as.character(bwerm.mu))
set(pipeline.args, which( pipeline.args$stat=='bwerm.sigma' ), 'v', as.character(bwerm.sigma))
set(pipeline.args, which( pipeline.args$stat=='startseq.mode' ), 'v', as.character(startseq.mode))
set(pipeline.args, which( pipeline.args$stat=='index.starttime.mode' ), 'v', as.character(index.starttime.mode))
print(pipeline.args)
if(1)
{
# scenario A
infile.ind <- '150129_HPTN071_scA'
infile.trm <- '150129_HPTN071_scA'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp150206-A'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
if(1)
{
# scenario E
infile.ind <- '150129_HPTN071_scE'
infile.trm <- '150129_HPTN071_scE'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp150206-E'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
}, by='label']
}
#
# high R2 runs -- these are used for the tree comparison
# for tree comparison TRAIN1: seed=42
# for tree comparison TRAIN2 and higher: seed=101
#
if(1)
{
indir <- '/Users/Oliver/git/HPTN071sim/source/PANGEA.HIV.sim/inst/misc'
pipeline.args <- sim.regional.args( yr.start=1985, yr.end=2020, seed=42, s.MODEL='Fixed2Prop', report.prop.recent=1.0,
s.PREV.max.n=1600, s.INTERVENTION.prop=0.25, s.INTERVENTION.start=2015, s.INTERVENTION.mul= NA, s.ARCHIVAL.n=50,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.05, root.edge.fixed=1,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.002239075)-0.3^2/2, wher.sigma=0.3,
bwerm.mu=log(0.002239075)-0.3^2/2, bwerm.sigma=0.3, er.gamma=4,
dbg.GTRparam=1, dbg.rER=1, index.starttime.mode='fix1955', startseq.mode='many', seqtime.mode='AtDiag')
# proposed standard run and control simulation
pipeline.vary <- data.table( startseq.mode= c('many','one'),
label= c('-m111','-o111'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='startseq.mode' ), 'v', as.character(startseq.mode))
print(pipeline.args)
if(0)
{
# scenario F
infile.ind <- '150129_HPTN071_scF'
infile.trm <- '150129_HPTN071_scF'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp150227-F'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
if(1)
{
# scenario E
infile.ind <- '150129_HPTN071_scE'
infile.trm <- '150129_HPTN071_scE'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp150227-E'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
}, by='label']
}
}
##--------------------------------------------------------------------------------------------------------
## olli 17.11.16
##--------------------------------------------------------------------------------------------------------
project.PANGEA.treecomparison.runninggaps2.getfasta<- function()
{
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/running_gaps_simulations'
infiles <- data.table(FFA=list.files(indir, pattern='fa$', full.names=TRUE))
infiles <- subset(infiles, grepl('FULL',FFA))
outdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/running_gaps_simulations2'
#
infiles[, DUMMY:= 1L]
infiles <- merge(infiles, as.data.table(expand.grid(DUMMY=1L, EXCL=seq(0.5, 0.9, 0.1))), by='DUMMY', allow.cartesian=TRUE)
infiles[, DUMMY:=NULL]
infiles[, TRAIN_ID:= regmatches(FFA,regexpr('TRAIN[0-9]+', FFA))]
# read each file, rm taxa with more than z missing sites
# and write to file
invisible(infiles[, {
#FFA <- '/Users/Oliver/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/running_gaps_simulations/150701_Regional_TRAIN260_FULL_SIMULATED.fa'
#EXCL <- .9
#TRAIN_ID<- 'TRAIN260'
cat('\nTRAIN_ID', TRAIN_ID[1])
sq <- read.dna(FFA[1], format='fa')
tmp <- apply(as.character(sq),1,function(x) length(which(x=='?'))) / ncol(sq)
for(i in seq_along(EXCL))
{
z <- sq[names(tmp)[tmp<EXCL[i]],]
cat('\nnumber of taxa in reduced data set', nrow(z))
zz <- file.path(outdir, gsub(TRAIN_ID[i],paste(TRAIN_ID[i],100*EXCL[i],sep=''),basename(FFA)))
write.dna(z, file=zz, format='fa', colsep='', nbcol=-1)
file.copy( gsub('\\.fa','_gene.txt',FFA), gsub('\\.fa','_gene.txt',zz))
}
NULL
}, by=c('FFA')])
}
##--------------------------------------------------------------------------------------------------------
## olli 17.11.16
##--------------------------------------------------------------------------------------------------------
project.PANGEA.treecomparison.runninggaps3.getfasta<- function()
{
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/running_gaps_simulations'
infiles <- data.table(FFA=list.files(indir, pattern='fa$', full.names=TRUE))
infiles <- subset(infiles, grepl('FULL',FFA))
outdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/running_gaps_simulations3'
#
infiles[, DUMMY:= 1L]
#infiles <- merge(infiles, as.data.table(expand.grid(DUMMY=1L, EXCL=seq(0.2, 0.4, 0.2))), by='DUMMY', allow.cartesian=TRUE)
infiles <- merge(infiles, as.data.table(expand.grid(DUMMY=1L, EXCL=c(0.01, 0.05, 0.1, 0.2))), by='DUMMY', allow.cartesian=TRUE)
infiles[, DUMMY:=NULL]
infiles[, TRAIN_ID:= regmatches(FFA,regexpr('TRAIN[0-9]+', FFA))]
infiles <- subset(infiles, grepl('TRAIN2',TRAIN_ID))
# read each file, rm taxa with more than z missing sites
# and write to file
invisible(infiles[, {
#FFA <- '/Users/Oliver/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/running_gaps_simulations/150701_Regional_TRAIN260_FULL_SIMULATED.fa'
#EXCL <- .05; TRAIN_ID<- 'TRAIN260'
cat('\nTRAIN_ID', TRAIN_ID[1])
sq <- read.dna(FFA[1], format='fa')
tmp <- data.table(SITE=seq_len(ncol(sq)), UNASS_P=apply(as.character(sq),2,function(x) length(which(x=='?'))) / nrow(sq))
for(i in seq_along(EXCL))
{
setkey(tmp, UNASS_P)
#tmp[, KEEP:= UNASS_P<EXCL[i]]
tmp[, KEEP:=FALSE]
set(tmp, floor(EXCL[i]*nrow(tmp)):nrow(tmp),'KEEP',TRUE)
setkey(tmp, SITE)
tmp[, SITE_NEW:= NA_real_]
set(tmp, tmp[, which(KEEP)], 'SITE_NEW', tmp[, as.double(seq_along(which(KEEP)))])
# make fasta file
z <- sq[, tmp$KEEP]
cat('\nnumber of columns in reduced data set', ncol(z))
#zz <- file.path(outdir, gsub(TRAIN_ID[i],paste(TRAIN_ID[i],'_EXCLSITES',100*EXCL[i],sep=''),basename(FFA)))
zz <- file.path(outdir, gsub(TRAIN_ID[i],paste(TRAIN_ID[i],'_EXCLWORST',100*EXCL[i],sep=''),basename(FFA)))
write.dna(z, file=zz, format='fa', colsep='', nbcol=-1)
# make partitions
z <- data.table( LOWER= c( subset(tmp, SITE<=1477)[, min(SITE_NEW, na.rm=1)], subset(tmp, SITE>1477 & SITE<=4290)[, min(SITE_NEW, na.rm=1)], subset(tmp, SITE>4290)[, min(SITE_NEW, na.rm=1)]),
UPPER= c(subset(tmp, SITE<=1477)[, max(SITE_NEW, na.rm=1)], subset(tmp, SITE>1477 & SITE<=4290)[, max(SITE_NEW, na.rm=1)], subset(tmp, SITE>4290)[, max(SITE_NEW, na.rm=1)]))
z[, LEN:= UPPER-LOWER+1L]
z <- subset(z, is.finite(LEN))
for(j in rev(seq_len(nrow(z))))
if(j>1 & z[j,LEN]<200)
{
set(z, j-1L, 'UPPER', z[j,UPPER])
set(z, j-1L, 'LEN', z[j-1L,LEN]+z[j,LEN])
set(z, j, 'LEN', NA_real_)
}
z <- subset(z, !is.na(LEN))
if(nrow(z)>1)
{
z <- paste( z[, list(PART=paste('DNA, p',LOWER,' = ',LOWER,'-',UPPER,'\n',sep='')), by='LOWER'][, PART], collapse='')
cat(z, file=gsub('\\.fa','_gene.txt',zz))
}
}
NULL
}, by=c('FFA')])
# check len
tmp <- data.table(F=list.files(outdir, pattern='fa$', full.names=TRUE))
tmp <- subset(tmp, grepl('EXCLSITE',F))
tmp <- tmp[,{
z<- read.dna(F, format='fa')
list(COL=ncol(z))
},by='F']
tmp[, EXCL:= tmp[, gsub('EXCLSITES','',regmatches(F, regexpr('EXCLSITES[0-9]+',F)))]]
tmp[, RUNGAPS:= tmp[, as.numeric(gsub('TRAIN[0-9]','',regmatches(F, regexpr('TRAIN[0-9]+',F))))/100]]
ggplot(tmp, aes(x=RUNGAPS, colour=EXCL, y=COL)) + geom_line() + geom_point()
}
##--------------------------------------------------------------------------------------------------------
## olli 22.06.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.treecomparison.simulation.pipeline<- function()
{
if(1)
{
library(PANGEA.HIV.sim)
indir <- '/Users/Oliver/git/HPTN071sim/source/PANGEA.HIV.sim/inst/misc'
pipeline.args <- sim.regional.args( yr.start=1985, yr.end=2020, seed=42, s.MODEL='Fixed2Prop', report.prop.recent=1.0,
s.PREV.max.n=1600, s.INTERVENTION.prop=0.25, s.INTERVENTION.start=2015, s.INTERVENTION.mul= NA, s.ARCHIVAL.n=50,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.05, root.edge.fixed=1,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.002239075)-0.3^2/2, wher.sigma=0.3,
bwerm.mu=log(0.002239075)-0.3^2/2, bwerm.sigma=0.3, er.gamma=4,
dbg.GTRparam=1, dbg.rER=1, index.starttime.mode='fix1955', startseq.mode='many', seqtime.mode='AtDiag')
# proposed standard run and control simulation
pipeline.vary <- data.table( startseq.mode= c('many','one'),
label= c('-m111','-o111'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='startseq.mode' ), 'v', as.character(startseq.mode))
print(pipeline.args)
if(1)
{
# scenario F
infile.ind <- '150129_HPTN071_scF'
infile.trm <- '150129_HPTN071_scF'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp150227-F'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
if(1)
{
# scenario E
infile.ind <- '150129_HPTN071_scE'
infile.trm <- '150129_HPTN071_scE'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp150227-E'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
}, by='label']
#
# same as above but no heterogeneity on codon positions
#
pipeline.args <- sim.regional.args( seed=42,
s.MODEL='Fixed2Prop', s.PREV.max.n=1600, s.INTERVENTION.prop=0.25, s.INTERVENTION.start=2015, s.INTERVENTION.mul= NA, s.ARCHIVAL.n=50,
report.prop.recent=1.0, epi.acute='low', epi.intervention='none',
epi.import=0.05, root.edge.fixed=1,
#wher.mu=log(0.002239075)-0.3^2/2, wher.sigma=0.3, bwerm.mu=log(0.002239075)-0.3^2/2, bwerm.sigma=0.3, er.gamma=4,
wher.mu=log(0.002239075), wher.sigma=0, bwerm.mu=log(0.002239075), bwerm.sigma=0,
er.gamma=4, er.gtr='GTR_POL_CP1',
dbg.GTRparam=1, dbg.rER=1, index.starttime.mode='fix1955', startseq.mode='one', seqtime.mode='AtDiag')
pipeline.vary <- data.table( label='2' )
dummy <- pipeline.vary[, {
tmpdir <- '/Users/Oliver/duke/tmp/150701_Regional_GTRFIX' #low acute
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
file <- sim.regional(tmpdir, pipeline.args=pipeline.args)
cat(file)
#stop()
system(file)
}, by='label']
#
# same as above but 60% taxon sampling of all PLWH 2020
#
pipeline.args <- sim.regional.args( seed=42,
s.MODEL='Prop2SuggestedSampling', s.PREV.max=0.99, s.INTERVENTION.prop=0.25, s.INTERVENTION.start=2015, s.INTERVENTION.mul= NA, s.ARCHIVAL.n=50,
report.prop.recent=1.0, epi.acute='low', epi.intervention='none',
epi.import=0.05, root.edge.fixed=1,
wher.mu=log(0.002239075)-0.3^2/2, wher.sigma=0.3, bwerm.mu=log(0.002239075)-0.3^2/2, bwerm.sigma=0.3, er.gtr='GTR_VARIABLE',
#wher.mu=log(0.002239075), wher.sigma=0, bwerm.mu=log(0.002239075), bwerm.sigma=0, er.gtr='GTR_POL_CP1',
er.gamma=4, dbg.GTRparam=1, dbg.rER=1, index.starttime.mode='fix1955', startseq.mode='one', seqtime.mode='AtDiag')
pipeline.vary <- data.table( label='4' )
dummy <- pipeline.vary[, {
tmpdir <- '/Users/Oliver/duke/tmp/150701_Regional_TRAIN' #low acute
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
file <- sim.regional(tmpdir, pipeline.args=pipeline.args)
cat(file)
#stop()
system(file)
}, by='label']
#
# same as above but 30% taxon sampling of all PLWH 2020
#
pipeline.args <- sim.regional.args( seed=42,
s.MODEL='Prop2SuggestedSampling', s.PREV.max=NA, s.INTERVENTION.prop=0.25, s.INTERVENTION.start=2015, s.INTERVENTION.mul= NA, s.ARCHIVAL.n=50,
report.prop.recent=1.0, epi.acute='low', epi.intervention='none',
epi.import=0.05, root.edge.fixed=1,
wher.mu=log(0.002239075)-0.3^2/2, wher.sigma=0.3, bwerm.mu=log(0.002239075)-0.3^2/2, bwerm.sigma=0.3, er.gtr='GTR_VARIABLE',
#wher.mu=log(0.002239075), wher.sigma=0, bwerm.mu=log(0.002239075), bwerm.sigma=0, er.gtr='GTR_POL_CP1',
er.gamma=4, dbg.GTRparam=1, dbg.rER=1, index.starttime.mode='fix1955', startseq.mode='one', seqtime.mode='AtDiag')
pipeline.vary <- data.table( s.PREV.max=c(0.5, 0.25), label=c('7','8') )
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max' ), 'v', as.character(s.PREV.max))
tmpdir <- '/Users/Oliver/duke/tmp/150701_Regional_TRAIN' #low acute
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
file <- sim.regional(tmpdir, pipeline.args=pipeline.args)
cat(file)
#stop()
system(file)
}, by='label']
}
}
##--------------------------------------------------------------------------------------------------------
## olli 22.06.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.treecomparison.gaps<- function()
{
#
# prepare gappy PANGEA files at different coverage
#
if(0)
{
indir.refalgn <- '/Users/Oliver/Dropbox (SPH Imperial College)/PANGEA_data/PANGEAconsensuses_2015-09_Imperial/PipelineInput'
infile.refalgn <- 'HIV1_COM_2012_genome_DNA_WithExtraA1UG.fasta'
indir.basefreq <- '/Users/Oliver/Dropbox (SPH Imperial College)/PANGEA_data/PANGEAconsensuses_2015-09_Imperial/PANGEA_HIV_n4562_Imperial_v150908_BaseFrequencies'
outdir <- '/Users/Oliver/git/HPTN071sim/treecomparison/PANGEAcov'
tmp <- c(10,20,30,40,50)
tmp <- c(70,90,100)
tmp <- c(200)
tmp <- 1
invisible(sapply(tmp, function(callconsensus.minc)
{
outfile <- '161228_PANGEAGlobal4282'
outfile <- paste(outfile, '_C', callconsensus.minc, '.fa', sep='')
seq <- PANGEA.align.from.basefreq(indir.refalgn, infile.refalgn, indir.basefreq, callconsensus.minc, outdir, verbose=1)
write.dna(seq, file=paste(outdir, '/', outfile, sep=''), format='fasta', colsep='', nbcol=-1)
save(seq, file=paste(outdir, '/', gsub('.fa','.R',outfile,fixed=TRUE), sep=''))
NULL
}))
# proportion of '?' calls
files <- list.files(outdir, pattern='PANGEAGlobal.*fa$')
ss <- lapply(files, function(x) read.dna(paste(outdir,'/',x,sep=''), format='fasta'))
dfc <- lapply(seq_along(ss), function(i)
{
tmp <- as.character(ss[[i]])
data.table(FILE=files[i], SEQ=rownames(tmp), NSEQ=apply(tmp!='-', 1, sum), NOCOV=apply(tmp=='?' | tmp=='n', 1, sum))
})
dfc <- do.call('rbind', dfc)
dfc[, COVERAGE:= dfc[, regmatches(FILE,regexpr('C[0-9]+',FILE))]]
set(dfc, NULL, 'COVERAGE', dfc[,as.numeric(substring(COVERAGE,2))])
dfc[, SITE:= dfc[, substr(SEQ, 6,7)]]
dfci <- dfc[, list(CALLED= median( (NSEQ-NOCOV)/NSEQ) ), by=c('SITE','COVERAGE')]
setkey(dfci, SITE, COVERAGE)
save(dfci, dfc, file='~/git/HPTN071sim/treecomparison/withgaps/150623_HPTN071_CoverageComparison.R')
ggplot(dfci, aes(x=COVERAGE, y=CALLED*100, colour=SITE, group=SITE)) +
geom_line() + geom_point() +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(breaks=seq(0,100,20),limits=c(0,100),expand=c(0,0)) +
labs(x='majority coverage\n(minimum number of aligned short read calls that agree)', y='Non-?N calls out of ACGTRWS?N calls\n(median %)') + theme_bw()
ggsave(file='~/git/HPTN071sim/treecomparison/withgaps/150623_HPTN071_CoverageComparison.pdf', w=6, h=6)
}
#
# select largest cluster & create partition files for regional
#
if(0)
{
indir.nogaps <- '/Users/Oliver/git/HPTN071sim/treecomparison/nogaps'
indir.wgaps <- '/Users/Oliver/git/HPTN071sim/treecomparison/withgaps_150701'
infiles.wgaps <- list.files(indir.wgaps, pattern='^150623_HPTN071_TRAIN.*[0-9]\\.fa$')
# create tree for largest cluster
infiles.nogapsR <- list.files(indir.nogaps, pattern='DATEDTREE\\.newick')
for(infile.nogapsR in infiles.nogapsR)
{
#infile.nogapsR <- infiles.nogapsR[1]
ph <- read.tree(paste(indir.nogaps,'/',infile.nogapsR,sep=''))
load(paste(indir.nogaps, '/', gsub('DATEDTREE.*','SIMULATED_INTERNAL.R',infile.nogapsR), sep=''))
df.clu <- subset(df.inds, !is.na(TIME_SEQ))[, list(IDPOP=IDPOP, CLUN=length(IDPOP)), by='IDCLU']
df.clu <- subset(df.clu, max(CLUN)==CLUN)
tmp <- paste('IDPOP_',df.clu[1,IDPOP],'|',sep='')
tmp <- which(sapply(seq_along(ph), function(i) any(grepl(tmp,ph[[i]]$tip.label,fixed=TRUE)) ))
stopifnot(tmp>0)
write.tree(ph[[tmp]],paste(indir.nogaps,'/',gsub('\\.newick','_lrgstclu\\.newick',infile.nogapsR),sep=''))
}
# create partition files
for(infile.wgaps in infiles.wgaps)
{
infile.nogapsR <- gsub('[^_]*\\.fa','INTERNAL.R',infile.wgaps)
# select large transmission cluster
load(paste(indir.nogaps, '/', infile.nogapsR, sep=''))
df.clu <- subset(df.inds, !is.na(TIME_SEQ))[, list(IDPOP=IDPOP, CLUN=length(IDPOP)), by='IDCLU']
df.clu <- subset(df.clu, max(CLUN)==CLUN)
tmp <- read.dna(file=paste(indir.wgaps, '/', infile.wgaps, sep=''), format='fasta')
dfs <- data.table(LABEL= rownames(tmp))
dfs[, IDPOP:= as.integer(substring(sapply(strsplit(LABEL,'|',fixed=TRUE), '[[', 1),7))]
dfs <- merge(df.clu, dfs, by='IDPOP')
seq <- tmp[dfs[,LABEL],]
file <- paste(indir.wgaps, '/', gsub('\\.','_lrgstclu\\.',infile.wgaps), sep='')
write.dna(seq, file=file, format='fasta', colsep='', nbcol=-1)
save(seq, file=gsub('\\.fa.*','\\.R',file))
# determine partition lengths
# pol start
infile.nogaps <- list.files(indir.nogaps, pattern='pol\\.fa$')
infile.nogaps <- infile.nogaps[ grepl(gsub('_SIMULATED.*','',infile.wgaps),infile.nogaps) ]
stopifnot(length(infile.nogaps)==1)
tmp <- read.dna(paste(indir.nogaps, '/', infile.nogaps, sep=''), format='fasta')[rownames(seq)[1],]
stopifnot(nrow(tmp)==1)
key <- paste(as.character(tmp)[1,1:8], collapse='')
for(i in seq_len(nrow(seq)))
{
pol.start <- regexpr(key, paste(as.character(seq[i,]), collapse=''))
if(pol.start>1)
break
}
stopifnot(pol.start>1)
# env start
infile.nogaps <- list.files(indir.nogaps, pattern='env\\.fa$')
infile.nogaps <- infile.nogaps[ grepl(gsub('_SIMULATED.*','',infile.wgaps),infile.nogaps) ]
stopifnot(length(infile.nogaps)==1)
tmp <- read.dna(paste(indir.nogaps, '/', infile.nogaps, sep=''), format='fasta')[rownames(seq)[1],]
stopifnot(nrow(tmp)==1)
key <- paste(as.character(tmp)[1,1:8], collapse='')
for(i in seq_len(nrow(seq)))
{
env.start <- regexpr(key, paste(as.character(seq[i,]), collapse=''))
if(env.start>1)
break
}
stopifnot(env.start>1)
# write codon partition file
tmp <- paste(c( 'DNA, gagcodon1 = 1-',pol.start-1,'\\3\n',
'DNA, gagcodon2 = 2-',pol.start-1,'\\3\n',
'DNA, gagcodon3 = 3-',pol.start-1,'\\3\n',
'DNA, polcodon1 = ',pol.start,'-',env.start-1,'\\3\n',
'DNA, polcodon2 = ',pol.start+1,'-',env.start-1,'\\3\n',
'DNA, polcodon3 = ',pol.start+2,'-',env.start-1,'\\3\n',
'DNA, envcodon1 = ',env.start,'-',ncol(seq),'\\3\n',
'DNA, envcodon2 = ',env.start+1,'-',ncol(seq),'\\3\n',
'DNA, envcodon3 = ',env.start+2,'-',ncol(seq),'\\3\n'), collapse='' )
infile.partition <- gsub('\\.fa.*','_codon\\.txt',basename(file))
cat(file=paste(indir.wgaps,'/',infile.partition,sep=''), tmp)
}
}
#
# create RAxML tree with -fo and codon partitions on mock data
#
if(0)
{
#
# run ExamML with partition
#
indir <- indir.wgaps
signat.in <- "ZAC5_lrgstclu"
signat.out <- "ZAC5_lrgstclu"
infile <- gsub(paste('_',signat.in,sep=''),'',gsub('\\..*','',basename(file)))
args.parser <- paste("-m DNA -q",infile.partition)
cmd <- hivc.cmd.examl.bootstrap(indir, infile, signat.in, signat.out, bs.from=0, bs.to=0, prog.bscreate=PR.EXAML.BSCREATE, prog.parser= PR.EXAML.PARSER, prog.starttree= PR.EXAML.STARTTREE, prog.examl=PR.EXAML.EXAML, opt.bootstrap.by="codon", args.parser=args.parser, args.examl="-m GAMMA -D", prog.supportadder=PR.EXAML.BS, tmpdir.prefix="examl")
# run RAxML
# change seed several times to get replicate
# compare trees to true tree tmw
# set up big trees on cluster for best performing RF
cmd <- paste('raxmlHPC-AVX -s ',basename(file),'-m GTRGAMMAI -p42 -n ',gsub('fa','newick',basename(file)), sep='')
cmd <- paste('raxmlHPC-AVX -s ',basename(file),' -q ', paste('TMP_',gsub('fa','txt',basename(file)),sep=''),' -m GTRGAMMAI -p42 -n ',gsub('\\.fa','_codon\\.newick',basename(file)), sep='')
cmd <- paste('raxmlHPC-AVX -f o -s ',basename(file),' -q ', paste('TMP_',gsub('fa','txt',basename(file)),sep=''),' -m GTRGAMMAI -p42 -n ',gsub('\\.fa','_codon_hcold\\.newick',basename(file)), sep='')
cat(cmd)
}
# duplicate ExaML files
if(0)
{
indir.wgaps <- '/Users/Oliver/git/HPTN071sim/treecomparison/withgapstrees'
infiles <- data.table(FILE=list.files(indir.wgaps, pattern='\\.R$'))
infiles[, {
file.copy( paste(indir.wgaps,'/',FILE,sep=''), paste(indir.wgaps,'/',gsub('Apr15','Apr15_NOQ',gsub('SIMULATED','SIMULATED_NOQ',FILE)),sep=''))
}, by='FILE']
}
# duplicate ExaML files: remove taxa with a high % of '?'
if(0)
{
indir.wgaps <- '/Users/Oliver/git/HPTN071sim/treecomparison/withgapstrees'
infiles <- data.table(FILE=list.files(indir.wgaps, pattern='\\.R$'))
infiles <- subset(infiles, !grepl('NOQ',FILE) & grepl('BWC5_|UGC5_',FILE))
invisible(lapply(infiles[, FILE],function(file)
{
load( paste(indir.wgaps,'/',file,sep='') )
dc <- data.table(LABEL=rownames(seq), CVG=seq.length(seq, exclude=c('-','?')) / seq.length(seq, exclude=c('-')))
# keep taxa with ACGT >10%, >30%, >60%, >80%
dc <- do.call('rbind',lapply(c(0.1,0.3,0.6,0.8), function(cut)
{
tmp <- subset(dc, CVG>=cut)
tmp[, CUT:=cut]
tmp
}))
# for each cut, create extra data set
seqc <- copy(seq)
invisible(dc[,{
seq <- seqc[LABEL,]
cat(paste('\nNumber of taxa in reduced alignment n=',nrow(seq)))
save(seq, file=paste(indir.wgaps,'/',gsub('SIMULATED',paste('SIMULATED_CUT',CUT*100,sep=''),file),sep='') )
}, by='CUT'])
NULL
}))
infiles <- data.table(FILE=list.files(indir.wgaps, pattern='\\.R$'))
infiles <- subset(infiles, grepl('CUT',FILE))
infiles[, PARTITION:= gsub('\\.R','_codon.txt',FILE)]
infiles[, {
file.copy( paste(indir.wgaps,'/',gsub('_CUT[0-9]+','',PARTITION),sep=''), paste(indir.wgaps,'/',PARTITION,sep=''))
}, by='FILE']
}
#
# create ExaML script files
#
if(0)
{
indir.wgaps <- '/Users/Oliver/git/HPTN071sim/treecomparison/withgapstrees'
infiles <- data.table(FILE=list.files(indir.wgaps, pattern='\\.R$'))
infiles <- data.table(FILE=list.files(indir.wgaps, pattern='\\.R$'))
infiles[,SIGNAT:=infiles[, regmatches(FILE,regexpr('BWC.*|UGC.*', FILE))]]
tmp <- infiles[, list(BASE=gsub(paste('_',SIGNAT,sep=''),'',FILE)), by='FILE']
infiles <- merge(infiles, tmp, by='FILE')
set(infiles, NULL, 'SIGNAT', infiles[, gsub('\\.R','',SIGNAT)])
infiles[, PARTITION:= gsub('\\.R','_codon.txt',FILE)]
bs.from <- 0
bs.to <- 0
bs.n <- 1
#infiles <- subset(infiles, grepl('NOQ',FILE) & grepl('BWC5_|UGC5_',FILE))
infiles <- subset(infiles, grepl('CUT',FILE))
#infiles <- subset(infiles, grepl('BWC0',SIGNAT))
invisible(infiles[, {
infile <- BASE
signat.in <- signat.out <- SIGNAT
args.parser <- paste("-m DNA -q",PARTITION)
args.parser <- paste("-m DNA")
args.parser <- paste("-m DNA")
args.examl <- "-f o -m GAMMA"
#args.examl <- "-f k -m GAMMA -M"
cmd <- hivc.cmd.examl.bootstrap(indir.wgaps, infile, signat.in, signat.out, bs.from=bs.from, bs.to=bs.to, prog.bscreate=PR.EXAML.BSCREATE, prog.parser= PR.EXAML.PARSER, prog.starttree= PR.EXAML.STARTTREE, prog.examl=PR.EXAML.EXAML, opt.bootstrap.by="codon", args.parser=args.parser, args.examl=args.examl, prog.supportadder=PR.EXAML.BS, tmpdir.prefix="examl")
invisible(lapply(cmd, function(x)
{
x <- hivc.cmd.hpcwrapper(x, hpc.walltime=21, hpc.q="pqeelab", hpc.mem="950mb", hpc.nproc=1)
signat <- paste(strsplit(date(),split=' ')[[1]],collapse='_',sep='')
outfile <- paste("ex",signat,sep='.')
#cat(x)
hivc.cmd.hpccaller(outdir, outfile, x)
Sys.sleep(1)
}))
NULL
}, by='FILE'])
}
}
##--------------------------------------------------------------------------------------------------------
## olli 27.06.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.treecomparison.gaps.RobinsonFould.ByMajorityCoverageThreshold<- function()
{
#
# compare ExaML trees to true topology
#
if(1)
{
require(phangorn)
indir.wgaps <- '/Users/Oliver/git/HPTN071sim/treecomparison/withgapstrees_150630'
indir.nogaps <- '/Users/Oliver/git/HPTN071sim/treecomparison/nogaps_150630'
files <- data.table(SIMFILE=list.files(indir.wgaps, pattern='ExaML_result.*finaltree\\.[0-9]+', recursive=TRUE))
files[, TRUEFILE:= gsub('SIMULATED.*','DATEDTREE_lrgstclu.newick',gsub('ExaML_result\\.','',basename(SIMFILE)))]
set(files, NULL, 'TRUEFILE', files[, gsub('Vill_99_Apr15.*','Vill_99_Apr15.newick',TRUEFILE)])
files[, CNFG:= regmatches(basename(SIMFILE), regexpr('BWC[0-9]+|UGC[0-9]+',basename(SIMFILE)))]
files[, PARTITION:= 'None']
set(files, files[, which(!grepl('NOQ', basename(SIMFILE)))], 'PARTITION', '9ByGeneCodon')
files[, EXCLSEQ:= 'None']
tmp <- files[, which(grepl('CUT[0-9]+', basename(SIMFILE)))]
set(files, tmp, 'EXCLSEQ', files[tmp, regmatches(basename(SIMFILE), regexpr('CUT[0-9]+',basename(SIMFILE)))])
files[, DT:= regmatches(basename(SIMFILE), regexpr('TRAIN[0-9]+|Vill_99',basename(SIMFILE)))]
files[, REP:= as.numeric(regmatches(basename(SIMFILE), regexpr('[0-9]+$',basename(SIMFILE))))]
files[, CVRG:= as.numeric(regmatches(CNFG, regexpr('[0-9]+',CNFG)))]
files[, SITE:= regmatches(CNFG, regexpr('BW|UG',CNFG))]
files <- subset(files, PARTITION=='9ByGeneCodon' & EXCLSEQ=='None')
#robinson fould distance
#SIMFILE<- subset(files, SIMFILE=="Vill_99_Apr15_examlout_BWC0/ExaML_result.Vill_99_Apr15_BWC0.finaltree.000")[, SIMFILE]
#TRUEFILE<- subset(files, SIMFILE=="Vill_99_Apr15_examlout_BWC0/ExaML_result.Vill_99_Apr15_BWC0.finaltree.000")[, TRUEFILE]
drf <- files[,{
#print(SIMFILE)
stree <- unroot(read.tree(paste(indir.wgaps,'/',SIMFILE,sep='')))
otree <- unroot(read.tree(paste(indir.nogaps,'/',TRUEFILE,sep='')))
if(!is.binary.tree(stree))
stree <- multi2di(stree)
#https://groups.google.com/forum/#!topic/raxml/JgvxgknTeqw
#normalize with 2n-6
rf <- RF.dist(otree, stree, check.labels = TRUE)
list(RF=rf, NRF=rf/(2*Ntip(otree)-6))
}, by='SIMFILE']
drf <- merge(files, drf, by='SIMFILE')
#plot trees in one pdf
invisible(subset(files, REP<15)[,{
strees <- lapply(SIMFILE, function(x) read.tree(paste(indir.wgaps,'/',x,sep='')) )
otree <- read.tree(paste(indir.nogaps,'/',TRUEFILE[1],sep=''))
if(length(strees)+1==16)
{
cat(paste('\nplotting',DT,CNFG))
pdf(file=paste(indir.wgaps,'/',gsub('finaltree.[0-9]+','CMPTREES.pdf',SIMFILE[1]),sep=''), w=8, h=8)
def.par <- par(no.readonly = TRUE)
par(mar=c(0.5,0.5,0.5,0.5))
layout( matrix(c(1:16), 4, 4, byrow = TRUE) )
plot(otree, show.tip.label=FALSE,edge.color="red")
invisible(sapply(strees, function(x) plot(x, show.tip.label=FALSE)))
par(def.par)
dev.off()
}
NULL
}, by=c('DT','CNFG')])
#plot RF distance
ggplot(drf, aes(x=CVRG, y=NRF*100, colour=SITE)) +
geom_point() +
scale_x_continuous(expand=c(0.01,0.01), breaks=seq(0,200,20)) +
scale_y_continuous(breaks=seq(0,100,20),limits=c(0,100),expand=c(0,0)) +
labs(x='majority coverage\n(minimum number of aligned short read calls that agree)', y='normalized RF distance btw est and true trees\n(RF/(2*taxa-6)') +
facet_grid(~DT) +
theme_bw() + theme(panel.margin.x = unit(0.8, "lines"))
ggsave(file= paste(indir.wgaps,'/150623_ExaML_Codon_RF_pts.pdf',sep=''), w=9, h=4)
ggplot(drf, aes(x=factor(CVRG), y=NRF*100, colour=SITE)) +
geom_point(colour='grey50') + geom_boxplot() +
scale_x_discrete(expand=c(0.01,0.01)) +
scale_y_continuous(breaks=seq(0,100,20),limits=c(0,100),expand=c(0,0)) +
labs(x='majority coverage\n(minimum number of aligned short read calls that agree)', y='normalized RF distance btw est and true trees\n(RF/(2*taxa-6)') +
facet_grid(~DT) +
theme_bw() + theme(panel.margin.x = unit(0.8, "lines"))
ggsave(file= paste(indir.wgaps,'/150630_ExaML_Codon_RF_box.pdf',sep=''), w=12, h=4)
}
}
##--------------------------------------------------------------------------------------------------------
## olli 30.06.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.treecomparison.gaps.RobinsonFould.ByPartitionScheme<- function()
{
#
# compare ExaML trees to true topology
#
require(phangorn)
indir.wgaps <- '/Users/Oliver/git/HPTN071sim/treecomparison/withgapstrees_150630'
indir.nogaps <- '/Users/Oliver/git/HPTN071sim/treecomparison/nogaps'
files <- data.table(SIMFILE=list.files(indir.wgaps, pattern='ExaML_result.*finaltree\\.[0-9]+', recursive=TRUE))
files[, TRUEFILE:= gsub('SIMULATED.*','DATEDTREE_lrgstclu.newick',gsub('ExaML_result\\.','',basename(SIMFILE)))]
set(files, NULL, 'TRUEFILE', files[, gsub('Vill_99_Apr15.*','Vill_99_Apr15.newick',TRUEFILE)])
files[, CNFG:= regmatches(basename(SIMFILE), regexpr('BWC[0-9]+|UGC[0-9]+',basename(SIMFILE)))]
files[, PARTITION:= 'None']
set(files, files[, which(!grepl('NOQ', basename(SIMFILE)))], 'PARTITION', '9ByGeneCodon')
files[, EXCLSEQ:= 'None']
tmp <- files[, which(grepl('CUT[0-9]+', basename(SIMFILE)))]
set(files, tmp, 'EXCLSEQ', files[tmp, regmatches(basename(SIMFILE), regexpr('CUT[0-9]+',basename(SIMFILE)))])
files[, DT:= regmatches(basename(SIMFILE), regexpr('TRAIN[0-9]+|Vill_99',basename(SIMFILE)))]
files[, REP:= as.numeric(regmatches(basename(SIMFILE), regexpr('[0-9]+$',basename(SIMFILE))))]
files[, CVRG:= as.numeric(regmatches(CNFG, regexpr('[0-9]+',CNFG)))]
files[, SITE:= regmatches(CNFG, regexpr('BW|UG',CNFG))]
filess <- subset(files, (CNFG=='BWC5' | CNFG=='UGC5') & EXCLSEQ=='None' & DT=='TRAIN2')
#robinson fould distance
drf <- filess[,{
#print(SIMFILE)
stree <- unroot(read.tree(paste(indir.wgaps,'/',SIMFILE,sep='')))
otree <- unroot(read.tree(paste(indir.nogaps,'/',TRUEFILE,sep='')))
if(!is.binary.tree(stree))
stree <- multi2di(stree)
#https://groups.google.com/forum/#!topic/raxml/JgvxgknTeqw
#normalize with 2n-6
rf <- RF.dist(otree, stree, check.labels = TRUE)
list(RF=rf, NRF=rf/(2*Ntip(otree)-6))
}, by='SIMFILE']
drf <- merge(filess, drf, by='SIMFILE')
#plot RF distance
ggplot(drf, aes(x=factor(PARTITION), y=NRF*100, colour=SITE)) +
geom_point(colour='grey50') + geom_boxplot() +
scale_x_discrete(expand=c(0.01,0.01)) +
scale_y_continuous(breaks=seq(0,100,20),limits=c(0,100),expand=c(0,0)) +
labs(x='partition scheme used', y='normalized RF distance btw est and true trees\n(RF/(2*taxa-6)') +
facet_grid(~DT) +
theme_bw() + theme(panel.margin.x = unit(0.8, "lines"))
ggsave(file= paste(indir.wgaps,'/150630_ExaML_PartitionNoneOrCodon_RF_box.pdf',sep=''), w=5, h=4)
}
##--------------------------------------------------------------------------------------------------------
## olli 01.07.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.treecomparison.gaps.simulate.rungapdatasets<- function()
{
if(1)
{
indir.simu <- '/Users/Oliver/git/HPTN071sim/treecomparison/nogaps_161121'
indir.gap <- '~/git/HPTN071sim/treecomparison/PANGEAcov'
infile.gap <- '150623_PANGEAGlobal2681_C5.fa'
outdir <- '/Users/Oliver/git/HPTN071sim/treecomparison/withgaps_161121'
gap.symbol <- '?'
gap.seed <- 42
infile.gap <- '151113_PANGEAGlobal4562_C10.fa'
outfile.cov <- regmatches(infile.gap,regexpr('C[0-9]+',basename(infile.gap)))
infile.simu <- '161121_Regional_TRAIN6_SIMULATED'
infile.simu <- '161121_Regional_TRAIN7_SIMULATED'
infile.simu <- '161121_Regional_TRAIN8_SIMULATED'
# align and rbind simulated and real sequences, rm gap rows and trailing gap cols too:
ms <- PANGEA.add.gaps.merge.and.maintain.triplets(indir.simu, indir.gap, infile.simu, infile.gap, verbose=1)
write.dna(ms, file=paste(indir.simu,'/',gsub('_SIMULATED','_SIMULATED_RMGPS\\.fa',infile.simu),sep=''),format='fasta', colsep='', nbcol=-1)
# make allocations reproducible
set.seed(gap.seed)
# read chunks from selected gap country sequence (is NA, then read from all PANGEA seqs)
sq <- ms[ grepl('PG[0-9]+', rownames(ms)), ]
ch <- seq.get.gap.chunks(sq, gap.symbol=c('?','n'))
setnames(ch, 'TAXON', 'PANGEA_ID')
ch[, SITE:= gsub('PG[0-9]+-','',regmatches(PANGEA_ID, regexpr('PG[0-9]+-[A-Z]+',PANGEA_ID))) ]
for(gap.p in seq(0.02, 0.6, 0.01))
{
cat('FULL',gap.p)
# gap coverage per PANGEA id
chs <- PANGEA.add.gaps.allocate.chunks.to.sequences(ch, ms)
# select chunks that give actual coverage closest to requested coverage
tmp <- chs[which.min(abs(GAP_P-gap.p)), IDX_COARSE]
chs <- subset(chs, IDX_COARSE<=tmp)
# copy gaps into alignment
gs <- as.character(ms)
for(i in seq_len(nrow(chs)))
gs[ chs[i,COPY_TO], chs[i, seq.int(POS, len=REP)] ] <- gap.symbol
# select simulated sequences
gs <- gs[which(grepl('IDPOP|HOUSE|HXB2',rownames(gs))), ]
tmp <- apply(gs[which(grepl('IDPOP|HOUSE',rownames(gs))), ],2,function(x) !all(x%in%c('?','n','-')))
gs <- gs[,tmp]
gap.p.sim <- round( sum(gs=='?') / (nrow(gs)*ncol(gs)), d=2 )
# determine gene start and end positions
tmp <- c( regexpr('agataggggggcaac', paste(gs[which(grepl('HXB2',rownames(gs))), ], collapse='') ),
regexpr('atgagagtgaaggagaa', paste(gs[which(grepl('HXB2',rownames(gs))), ], collapse='') ) )
gene.pos <- data.table(GENE=c('gag','pol','env'), START=c(1,tmp[1],tmp[2]), END=c(tmp[1]-1L,tmp[2]-1L,ncol(gs)))
# write to file
gs <- as.DNAbin(gs)
outfile <- paste(gsub('_SIMULATED','',infile.simu), sprintf('%02g',gap.p*100), '_FULL_SIMULATED.fa', sep='')
write.dna(gs, file=file.path(outdir, outfile), format='fasta', colsep='', nbcol=-1)
tmp <- gene.pos[, list(STR=paste('DNA, ',GENE,' = ',START,'-',END,sep='')), by='GENE']
cat(paste( tmp[, STR], collapse='\n'), file=file.path(outdir, gsub('.fa','_gene.txt',outfile)))
}
}
if(0)
{
indir.simu <- '/Users/Oliver/git/HPTN071sim/treecomparison/nogaps'
indir.gap <- '~/git/HPTN071sim/treecomparison/PANGEAcov'
infile.gap <- '150623_PANGEAGlobal2681_C5.fa'
outdir <- '/Users/Oliver/git/HPTN071sim/treecomparison/withgaps_160729'
gap.symbol <- '?'
gap.seed <- 42
infile.gap <- '151113_PANGEAGlobal4562_C10.fa'
outfile.cov <- regmatches(infile.gap,regexpr('C[0-9]+',basename(infile.gap)))
infile.simu <- '150701_Regional_TRAIN2_SIMULATED'
infile.simu <- '150701_Regional_TRAIN1_SIMULATED'
# align and rbind simulated and real sequences, rm gap rows and trailing gap cols too:
ms <- PANGEA.add.gaps.merge.and.maintain.triplets(indir.simu, indir.gap, infile.simu, infile.gap, verbose=1)
write.dna(ms, file=paste(indir.simu,'/',infile.simu,'_RMGPS.fa',sep=''),format='fasta', colsep='', nbcol=-1)
# this now includes HXB2 --> easy to define start + end of gag pol env (or any other restriction that I want to specify)
#
# process full genome runs
#
# make allocations reproducible
set.seed(gap.seed)
# read chunks from selected gap country sequence (is NA, then read from all PANGEA seqs)
sq <- ms[ grepl('PG[0-9]+', rownames(ms)), ]
ch <- seq.get.gap.chunks(sq, gap.symbol=c('?','n'))
setnames(ch, 'TAXON', 'PANGEA_ID')
ch[, SITE:= gsub('PG[0-9]+-','',regmatches(PANGEA_ID, regexpr('PG[0-9]+-[A-Z]+',PANGEA_ID))) ]
for(gap.p in seq(0.02, 0.6, 0.01))
#for(gap.p in seq(0.5, 0.6, 0.01))
{
cat('FULL',gap.p)
# gap coverage per PANGEA id
chs <- PANGEA.add.gaps.allocate.chunks.to.sequences(ch, ms)
# select chunks that give actual coverage closest to requested coverage
tmp <- chs[which.min(abs(GAP_P-gap.p)), IDX_COARSE]
chs <- subset(chs, IDX_COARSE<=tmp)
# copy gaps into alignment
gs <- as.character(ms)
for(i in seq_len(nrow(chs)))
gs[ chs[i,COPY_TO], chs[i, seq.int(POS, len=REP)] ] <- gap.symbol
# select simulated sequences
gs <- gs[which(grepl('IDPOP|HOUSE|HXB2',rownames(gs))), ]
tmp <- apply(gs[which(grepl('IDPOP|HOUSE',rownames(gs))), ],2,function(x) !all(x%in%c('?','n','-')))
gs <- gs[,tmp]
gap.p.sim <- round( sum(gs=='?') / (nrow(gs)*ncol(gs)), d=2 )
# determine gene start and end positions
tmp <- c( regexpr('agataggggggcaac', paste(gs[which(grepl('HXB2',rownames(gs))), ], collapse='') ),
regexpr('atgagagtgaaggagaa', paste(gs[which(grepl('HXB2',rownames(gs))), ], collapse='') ) )
gene.pos <- data.table(GENE=c('gag','pol','env'), START=c(1,tmp[1],tmp[2]), END=c(tmp[1]-1L,tmp[2]-1L,ncol(gs)))
# write to file
gs <- as.DNAbin(gs)
outfile <- paste(gsub('_SIMULATED','',infile.simu), sprintf('%02g',gap.p*100), '_FULL_SIMULATED.fa', sep='')
write.dna(gs, file=file.path(outdir, outfile), format='fasta', colsep='', nbcol=-1)
tmp <- gene.pos[, list(STR=paste('DNA, ',GENE,' = ',START,'-',END,sep='')), by='GENE']
cat(paste( tmp[, STR], collapse='\n'), file=file.path(outdir, gsub('.fa','_gene.txt',outfile)))
}
#fi <- data.table(F=list.files('/Users/Oliver/git/HPTN071sim/treecomparison/withgaps_160729', full.names=TRUE, pattern='^TMP'))
#fi[, O:= gsub('TRAIN1_conc.fa|TRAIN1_conc_gene.txt','TRAIN1',gsub('TMP15','15',F))]
#fi[, file.rename(F,O), by='F']
#
# process p17 genome runs
#
# determine p24 start, and use only gag p17
hxb2 <- as.character(ms[which(grepl('HXB2',rownames(ms))), ])
tmp <- regexpr('c-*c-*t-*a-*t-*a-*g-*t-*g-*c-*a-*g-*a-*a-*c-*a-*t-*c-*c-*a-*g-*g-*g', paste(hxb2, collapse='') )
msp <- ms[,1:(tmp-1L)]
# make allocations reproducible
set.seed(gap.seed)
# read chunks from selected gap country sequence (is NA, then read from all PANGEA seqs)
sq <- msp[ grepl('PG[0-9]+', rownames(msp)), ]
ch <- seq.get.gap.chunks(sq, gap.symbol=c('?','n'))
setnames(ch, 'TAXON', 'PANGEA_ID')
ch[, SITE:= gsub('PG[0-9]+-','',regmatches(PANGEA_ID, regexpr('PG[0-9]+-[A-Z]+',PANGEA_ID))) ]
for(gap.p in seq(0.02, 0.3, 0.01))
{
cat('P17',gap.p)
# gap coverage per PANGEA id -- this is the random bit so should be in the loop
chs <- PANGEA.add.gaps.allocate.chunks.to.sequences(ch, msp)
# select chunks that give actual coverage closest to requested coverage
tmp <- chs[which.min(abs(GAP_P-gap.p)), IDX_COARSE]
chs <- subset(chs, IDX_COARSE<=tmp)
# copy gaps into alignment
gs <- as.character(msp)
for(i in seq_len(nrow(chs)))
gs[ chs[i,COPY_TO], chs[i, seq.int(POS, len=REP)] ] <- gap.symbol
# select simulated sequences
gs <- gs[which(grepl('IDPOP|HOUSE|HXB2',rownames(gs))), ]
tmp <- apply(gs[which(grepl('IDPOP|HOUSE',rownames(gs))), ],2,function(x) !all(x%in%c('?','n','-')))
gs <- gs[,tmp]
gap.p.sim <- round( sum(gs=='?') / (nrow(gs)*ncol(gs)), d=2 )
# write to file
gs <- as.DNAbin(gs)
outfile <- paste(gsub('_SIMULATED','',infile.simu), sprintf('%02g',gap.p*100), '_P17_SIMULATED.fa', sep='')
write.dna(gs, file=file.path(outdir, outfile), format='fasta', colsep='', nbcol=-1)
}
# %gaps in PANGEA seqs
tmp <- sq[ grepl('PG[0-9]+-BW', rownames(sq)), ]
sum( as.character(tmp)=='?' ) / (nrow(tmp)*ncol(tmp))
tmp <- sq[ grepl('PG[0-9]+-UG', rownames(sq)), ]
sum( as.character(tmp)=='?' ) / (nrow(tmp)*ncol(tmp))
#
# process gag genome runs
#
# determine pol start, and use only gag
hxb2 <- as.character(ms[which(grepl('HXB2',rownames(ms))), ])
tmp <- regexpr('a-*g-*a-*t-*a-*g-*g-*g-*g-*g-*g-*c-*a-*a-*c', paste(hxb2, collapse='') )
msg <- ms[,1:(tmp-1L)]
# make allocations reproducible
set.seed(gap.seed)
# read chunks from selected gap country sequence (is NA, then read from all PANGEA seqs)
sq <- msg[ grepl('PG[0-9]+', rownames(msg)), ]
ch <- seq.get.gap.chunks(sq, gap.symbol=c('?','n'))
setnames(ch, 'TAXON', 'PANGEA_ID')
ch[, SITE:= gsub('PG[0-9]+-','',regmatches(PANGEA_ID, regexpr('PG[0-9]+-[A-Z]+',PANGEA_ID))) ]
for(gap.p in seq(0.02, 0.3, 0.01))
{
cat('GAG',gap.p)
# gap coverage per PANGEA id -- this is the random bit so should be in the loop
chs <- PANGEA.add.gaps.allocate.chunks.to.sequences(ch, msg)
# select chunks that give actual coverage closest to requested coverage
tmp <- chs[which.min(abs(GAP_P-gap.p)), IDX_COARSE]
chs <- subset(chs, IDX_COARSE<=tmp)
# copy gaps into alignment
gs <- as.character(msg)
for(i in seq_len(nrow(chs)))
gs[ chs[i,COPY_TO], chs[i, seq.int(POS, len=REP)] ] <- gap.symbol
# select simulated sequences
gs <- gs[which(grepl('IDPOP|HOUSE|HXB2',rownames(gs))), ]
tmp <- apply(gs[which(grepl('IDPOP|HOUSE',rownames(gs))), ],2,function(x) !all(x%in%c('?','n','-')))
gs <- gs[,tmp]
gap.p.sim <- round( sum(gs=='?') / (nrow(gs)*ncol(gs)), d=2 )
# write to file
gs <- as.DNAbin(gs)
outfile <- paste(gsub('_SIMULATED','',infile.simu), sprintf('%02g',gap.p*100), '_GAG_SIMULATED.fa', sep='')
write.dna(gs, file=file.path(outdir, outfile), format='fasta', colsep='', nbcol=-1)
}
#
# process gag+pol prot rt genome runs
#
# determine pol start, and use only gag
hxb2 <- as.character(ms[which(grepl('HXB2',rownames(ms))), ])
tmp <- regexpr('t-*a-*t-*g-*t-*a-*g-*a-*t-*g-*g-*g-*g-*c-*a-*g-*c-*t-*a-*a-*c-*a-*g-*g-*g-*a-*g', paste(hxb2, collapse='') )
msgp <- ms[,1:(tmp-1L)]
# make allocations reproducible
set.seed(gap.seed)
# read chunks from selected gap country sequence (is NA, then read from all PANGEA seqs)
sq <- msgp[ grepl('PG[0-9]+', rownames(msgp)), ]
ch <- seq.get.gap.chunks(sq, gap.symbol=c('?','n'))
setnames(ch, 'TAXON', 'PANGEA_ID')
ch[, SITE:= gsub('PG[0-9]+-','',regmatches(PANGEA_ID, regexpr('PG[0-9]+-[A-Z]+',PANGEA_ID))) ]
for(gap.p in seq(0.02, 0.45, 0.01))
{
cat('GAG POL',gap.p)
# gap coverage per PANGEA id -- this is the random bit so should be in the loop
chs <- PANGEA.add.gaps.allocate.chunks.to.sequences(ch, msgp)
# select chunks that give actual coverage closest to requested coverage
tmp <- chs[which.min(abs(GAP_P-gap.p)), IDX_COARSE]
chs <- subset(chs, IDX_COARSE<=tmp)
# copy gaps into alignment
gs <- as.character(msgp)
for(i in seq_len(nrow(chs)))
gs[ chs[i,COPY_TO], chs[i, seq.int(POS, len=REP)] ] <- gap.symbol
# select simulated sequences
gs <- gs[which(grepl('IDPOP|HOUSE|HXB2',rownames(gs))), ]
tmp <- apply(gs[which(grepl('IDPOP|HOUSE',rownames(gs))), ],2,function(x) !all(x%in%c('?','n','-')))
gs <- gs[,tmp]
gap.p.sim <- round( sum(gs=='?') / (nrow(gs)*ncol(gs)), d=2 )
# write to file
gs <- as.DNAbin(gs)
outfile <- paste(gsub('_SIMULATED','',infile.simu), sprintf('%02g',gap.p*100), '_GAGPP_SIMULATED.fa', sep='')
write.dna(gs, file=file.path(outdir, outfile), format='fasta', colsep='', nbcol=-1)
}
# %gaps in PANGEA seqs
tmp <- sq[ grepl('PG[0-9]+-BW', rownames(sq)), ]
sum( as.character(tmp)=='?' ) / (nrow(tmp)*ncol(tmp))
tmp <- sq[ grepl('PG[0-9]+-UG', rownames(sq)), ]
sum( as.character(tmp)=='?' ) / (nrow(tmp)*ncol(tmp))
#
# save all fasta files as R files without HXB2
#
infile <- data.table(FILE=list.files(outdir, pattern='\\.fa'))
infile[, SC:= gsub('\\..*','',FILE)]
invisible( infile[, {
#FILE<- '150701_Regional_TRAIN202_FULL_SIMULATED.fa'
sq <- read.dna(file.path(outdir,FILE), format='fa')
sq <- sq[!grepl('HXB2', rownames(sq)), ]
save(sq, file=file.path(outdir, gsub('.fa','.R',FILE)))
}, by='SC'] )
}
}
##--------------------------------------------------------------------------------------------------------
## olli 01.07.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.treecomparison.gaps.simulate.gtrfixed<- function()
{
indir.simu <- '~/git/HPTN071sim/treecomparison/nogaps_161121'
indir.gap <- '~/git/HPTN071sim/treecomparison/PANGEAcov'
infile.gap <- '151113_PANGEAGlobal4562_C10.fa'
outdir <- '~/git/HPTN071sim/treecomparison/withgaps_161121'
gap.symbol <- '?'
gap.seed <- 42
gap.country <- 'BW'
outfile.cov <- regmatches(infile.gap,regexpr('C[0-9]+',basename(infile.gap)))
infile.simu <- '161121_HPTN071_GTRFIXED2_SIMULATED'
# align and rbind simulated and real sequences, rm gap rows and trailing gap cols too:
ms <- PANGEA.add.gaps.merge.and.maintain.triplets(indir.simu, indir.gap, infile.simu, infile.gap, prefix.simulation='GTRFIXED', verbose=1)
write.dna(ms, file=paste(indir.simu,'/',infile.simu,'_RMGPS\\.fa',sep=''),format='fasta', colsep='', nbcol=-1)
# this now includes HXB2 --> easy to define start + end of gag pol env (or any other restriction that I want to specify)
ans <- PANGEA.add.gaps.simulate(ms, paste('PG[0-9]+-',gap.country,sep=''), gap.symbol, gap.seed, prefix.simu='IDPOP|HOUSE', with.hxb2=1, strip.gaps=1, verbose=1)
# save FULL without HXB2
outfile <- gsub('SIMULATED','FULL_SIMULATED.fasta',gsub('GTRFIXED2_','GTRFIXED2_',gsub('HPTN071_','',infile.simu)))
write.dna(ans[ !grepl('HXB2',rownames(ans)), ], file=paste(outdir, outfile, sep='/'), format='fasta', colsep='', nbcol=-1)
# write FULL partition table
tmp <- c( regexpr('agataggggggcaac', paste(ans[which(grepl('HXB2',rownames(ans))), ], collapse='') ),
regexpr('atgagagtgaaggagaa', paste(ans[which(grepl('HXB2',rownames(ans))), ], collapse='') ) )
gene.pos <- data.table(GENE=c('gag','pol','env'), START=c(1,tmp[1],tmp[2]), END=c(tmp[1]-1L,tmp[2]-1L,ncol(ans)))
tmp <- gene.pos[, list(STR=paste('DNA, ',GENE,' = ',START,'-',END,sep='')), by='GENE']
cat(paste( tmp[, STR], collapse='\n'), file=file.path(outdir, gsub('.fasta','_gene.txt',outfile)))
# save P17 without HXB2
outfile <- gsub('SIMULATED','P17_SIMULATED.fasta',gsub('GTRFIXED2_','GTRFIXED2_',gsub('HPTN071_','',infile.simu)))
hxb2 <- as.character(ans[which(grepl('HXB2',rownames(ans))), ])
tmp <- regexpr('c-*c-*t-*a-*t-*a-*g-*t-*g-*c-*a-*g-*a-*a-*c-*a-*t-*c-*c-*a-*g-*g-*g', paste(hxb2, collapse='') )
write.dna(ans[!grepl('HXB2',rownames(ans)),1:(tmp-1L)], file=paste(outdir, outfile, sep='/'), format='fasta', colsep='', nbcol=-1)
# save GAG without HXB2
outfile <- gsub('SIMULATED','GAG_SIMULATED.fasta',gsub('GTRFIXED2_','GTRFIXED2_',gsub('HPTN071_','',infile.simu)))
hxb2 <- as.character(ans[which(grepl('HXB2',rownames(ans))), ])
tmp <- regexpr('a-*g-*a-*t-*a-*g-*g-*g-*g-*g-*g-*c-*a-*a-*c', paste(hxb2, collapse='') )
write.dna(ans[!grepl('HXB2',rownames(ans)),1:(tmp-1L)], file=paste(outdir, outfile, sep='/'), format='fasta', colsep='', nbcol=-1)
# clean up
tmp <- list.files(indir.simu, pattern='^TMP', full.names=TRUE)
file.remove(tmp)
#
#
#
gap.country <- 'UG'
outfile.cov <- regmatches(infile.gap,regexpr('C[0-9]+',basename(infile.gap)))
infile.simu <- '161121_HPTN071_GTRFIXED2_SIMULATED'
ans <- PANGEA.add.gaps.simulate(ms, paste('PG[0-9]+-',gap.country,sep=''), gap.symbol, gap.seed, prefix.simu='IDPOP|HOUSE', with.hxb2=1, strip.gaps=1, verbose=1)
# save FULL without HXB2
outfile <- gsub('SIMULATED','FULL_SIMULATED.fasta',gsub('GTRFIXED2_','GTRFIXED3_',gsub('HPTN071_','',infile.simu)))
write.dna(ans[ !grepl('HXB2',rownames(ans)), ], file=paste(outdir, outfile, sep='/'), format='fasta', colsep='', nbcol=-1)
# write FULL partition table
tmp <- c( regexpr('agataggggggcaac', paste(ans[which(grepl('HXB2',rownames(ans))), ], collapse='') ),
regexpr('atgagagtgaaggagaa', paste(ans[which(grepl('HXB2',rownames(ans))), ], collapse='') ) )
gene.pos <- data.table(GENE=c('gag','pol','env'), START=c(1,tmp[1],tmp[2]), END=c(tmp[1]-1L,tmp[2]-1L,ncol(ans)))
tmp <- gene.pos[, list(STR=paste('DNA, ',GENE,' = ',START,'-',END,sep='')), by='GENE']
cat(paste( tmp[, STR], collapse='\n'), file=file.path(outdir, gsub('.fasta','_gene.txt',outfile)))
# save P17 without HXB2
outfile <- gsub('SIMULATED','P17_SIMULATED.fasta',gsub('GTRFIXED2_','GTRFIXED3_',gsub('HPTN071_','',infile.simu)))
hxb2 <- as.character(ans[which(grepl('HXB2',rownames(ans))), ])
tmp <- regexpr('c-*c-*t-*a-*t-*a-*g-*t-*g-*c-*a-*g-*a-*a-*c-*a-*t-*c-*c-*a-*g-*g-*g', paste(hxb2, collapse='') )
write.dna(ans[!grepl('HXB2',rownames(ans)),1:(tmp-1L)], file=paste(outdir, outfile, sep='/'), format='fasta', colsep='', nbcol=-1)
# save GAG without HXB2
outfile <- gsub('SIMULATED','GAG_SIMULATED.fasta',gsub('GTRFIXED2_','GTRFIXED3_',gsub('HPTN071_','',infile.simu)))
hxb2 <- as.character(ans[which(grepl('HXB2',rownames(ans))), ])
tmp <- regexpr('a-*g-*a-*t-*a-*g-*g-*g-*g-*g-*g-*c-*a-*a-*c', paste(hxb2, collapse='') )
write.dna(ans[!grepl('HXB2',rownames(ans)),1:(tmp-1L)], file=paste(outdir, outfile, sep='/'), format='fasta', colsep='', nbcol=-1)
#
# non-gaps files
#
files <- data.table(FILE=list.files(indir.simu, pattern='GTRFIXED.*\\.fa.*$'))
files[, SIMU:=files[, regmatches(FILE,regexpr('GTRFIXED[0-9]', FILE))]]
files[, GENE:=files[, sapply(strsplit(FILE,'_'), '[[', 5)]]
set(files, NULL, 'GENE', toupper(files[, substr(GENE,1,nchar(GENE)-3)]))
set(files, NULL, 'GENE', files[, factor(GENE, levels=c('GAG','POL','ENV'), labels=c('GAG','POL','ENV'))])
setkey(files, SIMU, GENE)
files[, {
gag <- read.dna( paste(indir.simu, FILE[1], sep='/'), format='fasta' )
pol <- read.dna( paste(indir.simu, FILE[2], sep='/'), format='fasta' )
env <- read.dna( paste(indir.simu, FILE[3], sep='/'), format='fasta' )
write.dna( gag[,1:387], file=paste(outdir, '/', '161121_GTRFIXED1_P17_SIMULATED.fasta', sep=''), format='fasta', colsep='', nbcol=-1)
write.dna( gag, file=paste(outdir, '/', '161121_GTRFIXED1_GAG_SIMULATED.fasta', sep=''), format='fasta', colsep='', nbcol=-1)
tmp <- cbind(gag, pol, env)
write.dna( tmp, file=paste(outdir, '/', '161121_GTRFIXED1_FULL_SIMULATED.fasta', sep=''), format='fasta', colsep='', nbcol=-1)
gene.pos <- data.table(GENE=c('gag','pol','env'), START=c(1,ncol(gag)+1,ncol(gag)+ncol(pol)+1), END=c(ncol(gag),ncol(gag)+ncol(pol),ncol(tmp)))
tmp <- gene.pos[, list(STR=paste('DNA, ',GENE,' = ',START,'-',END,sep='')), by='GENE']
cat(paste( tmp[, STR], collapse='\n'), file=paste(outdir, '/', '161121_GTRFIXED1_FULL_SIMULATED_gene.txt', sep=''))
NULL
}, by='SIMU']
}
##--------------------------------------------------------------------------------------------------------
## olli 19.04.17
##--------------------------------------------------------------------------------------------------------
project.PANGEA.treecomparison.run.FASTtree<- function()
{
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations'
outdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/FastTree'
infiles <- data.table(F=list.files(indir, pattern='^150701_Regional_TRAIN.*fa', full.names=TRUE))
infiles <- subset(infiles, !grepl('TRAIN3|TRAIN5',F))
infiles <- infiles[,{
#F <- '/Users/Oliver/Dropbox (SPH Imperial College)/PANGEAHIVsim/201507_TreeReconstruction/simulations/150701_Regional_TRAIN2_SIMULATED.fa'
#M <- 'raw'
cat('process sequences', F, '\n')
# load simulated alignment
seq <- read.dna(F, format='fasta')
# delete empty / very small sequences
tmp <- seq.length(seq, exclude=c('-','?','n'))
seq <- seq[ names(tmp)[tmp>=100], ]
tmp <- file.path(outdir,gsub('SIMULATED.fa','SIMULATED_min100.fasta',basename(F)))
write.dna(seq, file=tmp, format='fasta', colsep='', nbcol=-1)
list(F_SUB= tmp)
}, by=c('F')]
infiles <- data.table(expand.grid(F=list.files(outdir, pattern='fasta$', full.names=TRUE), REP=1:10, stringsAsFactors=FALSE))
infiles <- infiles[, {
cmd <- cmd.fasttree(F, outfile=gsub('\\.fasta',paste0('_REP',REP,'_fasttreedefault.newick'),F), pr.args=paste0('-nt -gtr -gamma -seed ',REP*42L))
tmp <- cmd.fasttree(F, outfile=gsub('\\.fasta',paste0('_REP',REP,'_fasttreeslow.newick'),F), pr.args=paste0('-nt -gtr -slow -gamma -seed ',REP*42L))
cmd <- paste(cmd, tmp, sep='\n')
tmp <- cmd.fasttree(F, outfile=gsub('\\.fasta',paste0('_REP',REP,'_fasttreeslowpseudo.newick'),F), pr.args=paste0('-nt -gtr -slow -pseudo -gamma -seed ',REP*42L))
cmd <- paste(cmd, tmp, sep='\n')
list(CMD=cmd)
}, by=c('F','REP')]
infiles[, {
cat(paste("#!/bin/sh", paste(CMD, collapse='\n'), sep='\n'), file= gsub('fasta','sh',F))
},by=c('F')]
}
##--------------------------------------------------------------------------------------------------------
## olli 01.07.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.treecomparison.gaps.simulate<- function()
{
indir.simu <- '/Users/Oliver/git/HPTN071sim/treecomparison/nogaps'
indir.gap <- '~/git/HPTN071sim/treecomparison/PANGEAcov'
infile.gap <- '150623_PANGEAGlobal2681_C5.fa'
outdir <- '/Users/Oliver/git/HPTN071sim/treecomparison/withgaps_160729'
gap.symbol <- '?'
gap.seed <- 42
if(0)
{
gap.country <- 'BW'
infile.gaps <- sapply(c(5),function(x) paste('150623_PANGEAGlobal2681_C',x,'.fa',sep=''))
invisible(lapply(infile.gaps, function(infile.gap)
{
outfile.cov <- regmatches(infile.gap,regexpr('C[0-9]+',basename(infile.gap)))
infile.simu <- '150701_Regional_TRAIN1_SIMULATED'
outfile <- paste(infile.simu, '_', gap.country, outfile.cov, '.fa', sep='')
ans <- PANGEA.add.gaps.merge.and.maintain.triplets(indir.simu, indir.gap, infile.simu, infile.gap, gap.country, gap.symbol, gap.seed, outfile=outfile, verbose=1)
write.dna(ans, file=paste(outdir, outfile, sep='/'), format='fasta', colsep='', nbcol=-1)
infile.simu <- '150701_Regional_TRAIN2_SIMULATED'
outfile <- paste(infile.simu, '_', gap.country, outfile.cov, '.fa', sep='')
ans <- PANGEA.add.gaps.merge.and.maintain.triplets(indir.simu, indir.gap, infile.simu, infile.gap, gap.country, gap.symbol, gap.seed, outfile=outfile, verbose=1)
write.dna(ans, file=paste(outdir, outfile, sep='/'), format='fasta', colsep='', nbcol=-1)
infile.simu <- '150701_Regional_TRAIN3_SIMULATED'
outfile <- paste(infile.simu, '_', gap.country, outfile.cov, '.fa', sep='')
ans <- PANGEA.add.gaps.merge.and.maintain.triplets(indir.simu, indir.gap, infile.simu, infile.gap, gap.country, gap.symbol, gap.seed, outfile=outfile, verbose=1)
write.dna(ans, file=paste(outdir, outfile, sep='/'), format='fasta', colsep='', nbcol=-1)
}))
}
if(0)
{
gap.country <- 'UG'
infile.gaps <- sapply(c(5),function(x) paste('150623_PANGEAGlobal2681_C',x,'.fa',sep=''))
invisible(lapply(infile.gaps, function(infile.gap)
{
outfile.cov <- regmatches(infile.gap,regexpr('C[0-9]+',basename(infile.gap)))
infile.simu <- '150701_Regional_TRAIN1_SIMULATED'
outfile <- paste(infile.simu, '_', gap.country, outfile.cov, '.fa', sep='')
ans <- PANGEA.add.gaps.merge.and.maintain.triplets(indir.simu, indir.gap, infile.simu, infile.gap, gap.country, gap.symbol, gap.seed, outfile=outfile, verbose=1)
write.dna(ans, file=paste(outdir, outfile, sep='/'), format='fasta', colsep='', nbcol=-1)
infile.simu <- '150701_Regional_TRAIN2_SIMULATED'
outfile <- paste(infile.simu, '_', gap.country, outfile.cov, '.fa', sep='')
ans <- PANGEA.add.gaps.merge.and.maintain.triplets(indir.simu, indir.gap, infile.simu, infile.gap, gap.country, gap.symbol, gap.seed, outfile=outfile, verbose=1)
write.dna(ans, file=paste(outdir, outfile, sep='/'), format='fasta', colsep='', nbcol=-1)
infile.simu <- '150701_Regional_TRAIN3_SIMULATED'
outfile <- paste(infile.simu, '_', gap.country, outfile.cov, '.fa', sep='')
ans <- PANGEA.add.gaps.merge.and.maintain.triplets(indir.simu, indir.gap, infile.simu, infile.gap, gap.country, gap.symbol, gap.seed, outfile=outfile, verbose=1)
write.dna(ans, file=paste(outdir, outfile, sep='/'), format='fasta', colsep='', nbcol=-1)
}))
}
if(0)
{
gap.country <- 'BW'
infile.gaps <- sapply(c(5),function(x) paste('150623_PANGEAGlobal2681_C',x,'.fa',sep=''))
invisible(lapply(infile.gaps, function(infile.gap)
{
outfile.cov <- regmatches(infile.gap,regexpr('C[0-9]+',basename(infile.gap)))
infile.simu <- 'Vill_99_Apr15'
outfile <- paste(infile.simu, '_', gap.country, outfile.cov, '.fa', sep='')
ans <- PANGEA.add.gaps.merge.and.maintain.triplets(indir.simu, indir.gap, infile.simu, infile.gap, gap.country, gap.symbol, gap.seed, outfile=outfile, verbose=1)
write.dna(ans, file=paste(outdir, outfile, sep='/'), format='fasta', colsep='', nbcol=-1)
}))
gap.country <- 'UG'
infile.gaps <- sapply(c(5),function(x) paste('150623_PANGEAGlobal2681_C',x,'.fa',sep=''))
invisible(lapply(infile.gaps, function(infile.gap)
{
outfile.cov <- regmatches(infile.gap,regexpr('C[0-9]+',basename(infile.gap)))
infile.simu <- 'Vill_99_Apr15'
outfile <- paste(infile.simu, '_', gap.country, outfile.cov, '.fa', sep='')
ans <- PANGEA.add.gaps.merge.and.maintain.triplets(indir.simu, indir.gap, infile.simu, infile.gap, gap.country, gap.symbol, gap.seed, outfile=outfile, verbose=1)
write.dna(ans, file=paste(outdir, outfile, sep='/'), format='fasta', colsep='', nbcol=-1)
}))
}
if(1)
{
gap.country <- 'BW'
infile.gaps <- sapply(c(5),function(x) paste('150623_PANGEAGlobal2681_C',x,'.fa',sep=''))
invisible(lapply(infile.gaps, function(infile.gap)
{
outfile.cov <- regmatches(infile.gap,regexpr('C[0-9]+',basename(infile.gap)))
infile.simu <- 'Vill_98_Jul15'
outfile <- paste(infile.simu, '_', gap.country, outfile.cov, '.fa', sep='')
ans <- PANGEA.add.gaps.merge.and.maintain.triplets(indir.simu, indir.gap, infile.simu, infile.gap, gap.country, gap.symbol, gap.seed, outfile=outfile, verbose=1)
write.dna(ans, file=paste(outdir, outfile, sep='/'), format='fasta', colsep='', nbcol=-1)
}))
gap.country <- 'UG'
infile.gaps <- sapply(c(5),function(x) paste('150623_PANGEAGlobal2681_C',x,'.fa',sep=''))
invisible(lapply(infile.gaps, function(infile.gap)
{
outfile.cov <- regmatches(infile.gap,regexpr('C[0-9]+',basename(infile.gap)))
infile.simu <- 'Vill_98_Jul15'
outfile <- paste(infile.simu, '_', gap.country, outfile.cov, '.fa', sep='')
ans <- PANGEA.add.gaps.merge.and.maintain.triplets(indir.simu, indir.gap, infile.simu, infile.gap, gap.country, gap.symbol, gap.seed, outfile=outfile, verbose=1)
write.dna(ans, file=paste(outdir, outfile, sep='/'), format='fasta', colsep='', nbcol=-1)
}))
}
if(0)
{
# store non-gappy alignments as BWC0
files <- data.table(FILE=list.files(indir.simu, pattern='TRAIN.*\\.fa.*$'))
files[, SIMU:=files[, regmatches(FILE,regexpr('TRAIN[0-9]', FILE))]]
files[, GENE:=files[, sapply(strsplit(FILE,'_'), '[[', 5)]]
set(files, NULL, 'GENE', toupper(files[, substr(GENE,1,nchar(GENE)-3)]))
set(files, NULL, 'GENE', files[, factor(GENE, levels=c('GAG','POL','ENV'), labels=c('GAG','POL','ENV'))])
setkey(files, SIMU, GENE)
# concatenate simu seqs
files[, {
gag <- read.dna( paste(indir.simu, FILE[1], sep='/'), format='fasta' )
pol <- read.dna( paste(indir.simu, FILE[2], sep='/'), format='fasta' )
env <- read.dna( paste(indir.simu, FILE[3], sep='/'), format='fasta' )
tmp <- cbind(gag, pol, env)
write.dna( tmp, file=paste(outdir, '/', gsub('gag','BWC0',FILE[1]), sep=''), format='fasta', colsep='', nbcol=-1)
NULL
}, by='SIMU']
}
if(0)
{
indir.simu <- '/Users/Oliver/git/HPTN071sim/treecomparison/nogaps_161121'
outdir <- '/Users/Oliver/git/HPTN071sim/treecomparison/withgaps_161121'
# store non-gappy alignments as BWC0
files <- data.table(FILE=list.files(indir.simu, pattern='TRAIN.*\\.fa.*$'))
files[, SIMU:=files[, regmatches(FILE,regexpr('TRAIN[0-9]', FILE))]]
files[, GENE:=files[, sapply(strsplit(FILE,'_'), '[[', 5)]]
set(files, NULL, 'GENE', toupper(files[, substr(GENE,1,nchar(GENE)-3)]))
set(files, NULL, 'GENE', files[, factor(GENE, levels=c('GAG','POL','ENV'), labels=c('GAG','POL','ENV'))])
setkey(files, SIMU, GENE)
# concatenate simu seqs
files[, {
gag <- read.dna( paste(indir.simu, FILE[1], sep='/'), format='fasta' )
pol <- read.dna( paste(indir.simu, FILE[2], sep='/'), format='fasta' )
env <- read.dna( paste(indir.simu, FILE[3], sep='/'), format='fasta' )
tmp <- cbind(gag, pol, env)
write.dna( tmp, file=paste(outdir, '/', gsub('_gag','',FILE[1]), sep=''), format='fasta', colsep='', nbcol=-1)
NULL
}, by='SIMU']
}
}
##--------------------------------------------------------------------------------------------------------
## olli 03.07.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.treecomparison.gaps.Cut.150703<- function()
{
#
# cut regional data sets to 950 sequences
#
indir.nogaps <- '/Users/Oliver/git/HPTN071sim/treecomparison/nogaps'
indir.wgaps <- '/Users/Oliver/git/HPTN071sim/treecomparison/withgaps_150701'
infiles.nogapsR <- list.files(indir.nogaps, pattern='DATEDTREE|SUBSTTREE\\.newick')
infiles.nogapsR <- infiles.nogapsR[grepl('TRAIN',infiles.nogapsR)]
for(infile.nogapsR in infiles.nogapsR)
{
#infile.nogapsR <- infiles.nogapsR[1]
# create tree for 980 taxa
ph <- read.tree(paste(indir.nogaps,'/',infile.nogapsR,sep=''))
load(paste(indir.nogaps, '/', gsub('DATEDTREE.*|SUBSTTREE.*','SIMULATED_INTERNAL.R',infile.nogapsR), sep=''))
df.clu <- subset(df.inds, !is.na(TIME_SEQ))[, list(IDPOP=IDPOP, CLUN=length(IDPOP)), by='IDCLU']
setkey(df.clu, IDCLU, CLUN)
tmp <- subset(unique(df.clu), cumsum(CLUN)<980)
df.clu <- merge(df.clu, subset(tmp, select=c(IDCLU)), by='IDCLU')
dfl <- data.table(LABEL= ph$tip.label)
dfl[, ID:= seq_len(nrow(dfl))]
dfl[, IDPOP:= dfl[, as.integer(substring(sapply(strsplit(LABEL, '|', fixed=1),'[[',1),7))]]
df.clu <- merge(dfl, df.clu, by='IDPOP')
ph <- drop.tip(ph, setdiff(ph$tip.label,df.clu[, LABEL]))
write.tree(ph,paste(indir.nogaps,'/',gsub('\\.newick','_TAXA980\\.newick',infile.nogapsR),sep=''))
# create alignment of 980 taxa
seq <- read.dna(paste(indir.wgaps, '/', gsub('DATEDTREE.*|SUBSTTREE.*','SIMULATED_BWC5.fa',infile.nogapsR), sep=''), format='fasta')
seq <- seq[df.clu[,LABEL],]
write.dna(seq, file=paste(indir.wgaps, '/', gsub('DATEDTREE.*|SUBSTTREE.*','SIMULATED_BWC5_TAXA980.fa',infile.nogapsR), sep=''), format='fasta', colsep='', nbcol=-1)
seq <- read.dna(paste(indir.wgaps, '/', gsub('DATEDTREE.*|SUBSTTREE.*','SIMULATED_UGC5.fa',infile.nogapsR), sep=''), format='fasta')
seq <- seq[df.clu[,LABEL],]
write.dna(seq, file=paste(indir.wgaps, '/', gsub('DATEDTREE.*|SUBSTTREE.*','SIMULATED_UGC5_TAXA980.fa',infile.nogapsR), sep=''), format='fasta', colsep='', nbcol=-1)
# copy partition files
file.copy( paste(indir.wgaps, '/', gsub('DATEDTREE.*|SUBSTTREE.*','SIMULATED_BWC5_gene.txt',infile.nogapsR), sep=''),
paste(indir.wgaps, '/', gsub('DATEDTREE.*|SUBSTTREE.*','SIMULATED_BWC5_TAXA980_gene.txt',infile.nogapsR), sep='') )
file.copy( paste(indir.wgaps, '/', gsub('DATEDTREE.*|SUBSTTREE.*','SIMULATED_BWC5_codon.txt',infile.nogapsR), sep=''),
paste(indir.wgaps, '/', gsub('DATEDTREE.*|SUBSTTREE.*','SIMULATED_BWC5_TAXA980_codon.txt',infile.nogapsR), sep='') )
file.copy( paste(indir.wgaps, '/', gsub('DATEDTREE.*|SUBSTTREE.*','SIMULATED_UGC5_gene.txt',infile.nogapsR), sep=''),
paste(indir.wgaps, '/', gsub('DATEDTREE.*|SUBSTTREE.*','SIMULATED_UGC5_TAXA980_gene.txt',infile.nogapsR), sep='') )
file.copy( paste(indir.wgaps, '/', gsub('DATEDTREE.*|SUBSTTREE.*','SIMULATED_UGC5_codon.txt',infile.nogapsR), sep=''),
paste(indir.wgaps, '/', gsub('DATEDTREE.*|SUBSTTREE.*','SIMULATED_UGC5_TAXA980_codon.txt',infile.nogapsR), sep='') )
}
#
# create CUTs
#
indir.wgaps <- '/Users/Oliver/git/HPTN071sim/treecomparison/withgapstrees_150703'
infiles <- data.table(FILE=list.files(indir.wgaps, pattern='\\.fa$'))
infiles <- subset(infiles, grepl('BWC5|UGC5',FILE))
invisible(lapply(infiles[, FILE],function(file)
{
print(file)
seq <- read.dna( paste(indir.wgaps,'/',file,sep=''), format='fasta' )
dc <- data.table(LABEL=rownames(seq), CVG=seq.length(seq, exclude=c('-','?')) / seq.length(seq, exclude=c('-')))
# keep taxa with ACGT >x%
dc <- do.call('rbind',lapply(c(0.05, 0.1, 0.2, 0.3, 0.6), function(cut)
{
tmp <- subset(dc, CVG>=cut)
tmp[, CUT:=cut]
tmp
}))
# for each cut, create extra data set
seqc <- copy(seq)
invisible(dc[,{
seq <- seqc[LABEL,]
cat(paste('\nNumber of taxa in reduced alignment n=',nrow(seq)))
save(seq, file=paste(indir.wgaps,'/',gsub('\\.fa',paste('_CUT',CUT*100,'\\.R',sep=''),file),sep='') )
}, by='CUT'])
NULL
}))
infiles <- data.table(FILE=list.files(indir.wgaps, pattern='\\.R$'))
infiles <- subset(infiles, grepl('CUT',FILE))
infiles[, PARTITION:= gsub('\\.R','_gene.txt',FILE)]
infiles[, {
file.copy( paste(indir.wgaps,'/',gsub('_CUT[0-9]+','',PARTITION),sep=''), paste(indir.wgaps,'/',PARTITION,sep=''))
}, by='FILE']
}
##--------------------------------------------------------------------------------------------------------
## olli 02.07.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.treecomparison.gaps.PartitionRegional<- function()
{
indir.nogaps <- '/Users/Oliver/git/HPTN071sim/treecomparison/nogaps'
indir.wgaps <- '/Users/Oliver/git/HPTN071sim/treecomparison/withgaps_150701'
infiles.wgaps <- list.files(indir.wgaps, pattern='^150701_Regional_TRAIN.*[0-9]\\.fa$')
# create partition files
for(infile.wgaps in infiles.wgaps)
{
seq <- read.dna(file=paste(indir.wgaps, '/', infile.wgaps, sep=''), format='fasta')
# determine partition lengths
# pol start
infile.nogaps <- list.files(indir.nogaps, pattern='pol\\.fa$')
infile.nogaps <- infile.nogaps[ grepl(gsub('_SIMULATED.*','',infile.wgaps),infile.nogaps) ]
stopifnot(length(infile.nogaps)==1)
tmp <- read.dna(paste(indir.nogaps, '/', infile.nogaps, sep=''), format='fasta')[rownames(seq)[1],]
stopifnot(nrow(tmp)==1)
key <- paste(as.character(tmp)[1,1:15], collapse='')
for(i in seq_len(nrow(seq)))
{
pol.start <- regexpr(key, paste(as.character(seq[i,]), collapse=''))
if(pol.start>1)
break
}
stopifnot(pol.start>1)
# env start
infile.nogaps <- list.files(indir.nogaps, pattern='env\\.fa$')
infile.nogaps <- infile.nogaps[ grepl(gsub('_SIMULATED.*','',infile.wgaps),infile.nogaps) ]
stopifnot(length(infile.nogaps)==1)
tmp <- read.dna(paste(indir.nogaps, '/', infile.nogaps, sep=''), format='fasta')[rownames(seq)[1],]
stopifnot(nrow(tmp)==1)
key <- paste(as.character(tmp)[1,1:15], collapse='')
for(i in seq_len(nrow(seq)))
{
env.start <- regexpr(key, paste(as.character(seq[i,]), collapse=''))
if(env.start>1)
break
}
stopifnot(env.start>1)
print(c(pol.start, env.start))
# write gene partition file
tmp <- paste(c( 'DNA, gag = 1-',pol.start-1,'\n',
'DNA, pol = ',pol.start,'-',env.start-1,'\n',
'DNA, env = ',env.start,'-',ncol(seq),'\n'), collapse='' )
infile.partition <- gsub('\\.fa.*','_gene\\.txt',infile.wgaps)
cat(file=paste(indir.wgaps,'/',infile.partition,sep=''), tmp)
# write codon partition file
tmp <- paste(c( 'DNA, gagcodon1 = 1-',pol.start-1,'\\3\n',
'DNA, gagcodon2 = 2-',pol.start-1,'\\3\n',
'DNA, gagcodon3 = 3-',pol.start-1,'\\3\n',
'DNA, polcodon1 = ',pol.start,'-',env.start-1,'\\3\n',
'DNA, polcodon2 = ',pol.start+1,'-',env.start-1,'\\3\n',
'DNA, polcodon3 = ',pol.start+2,'-',env.start-1,'\\3\n',
'DNA, envcodon1 = ',env.start,'-',ncol(seq),'\\3\n',
'DNA, envcodon2 = ',env.start+1,'-',ncol(seq),'\\3\n',
'DNA, envcodon3 = ',env.start+2,'-',ncol(seq),'\\3\n'), collapse='' )
infile.partition <- gsub('\\.fa.*','_codon\\.txt',infile.wgaps)
cat(file=paste(indir.wgaps,'/',infile.partition,sep=''), tmp)
}
}
##--------------------------------------------------------------------------------------------------------
## olli 01.07.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.treecomparison.gaps.PartitionVillage<- function()
{
indir.nogaps <- '/Users/Oliver/git/HPTN071sim/treecomparison/nogaps'
indir.wgaps <- '/Users/Oliver/git/HPTN071sim/treecomparison/withgaps_150701'
infiles.wgaps <- list.files(indir.wgaps, pattern='^Vill.*[0-9]\\.fa$')
for(infile.wgaps in infiles.wgaps)
{
print(infile.wgaps)
#infile.wgaps <- infiles.wgaps[1]
if(grepl('99_Apr15',infile.wgaps))
{
pol.key <- "tttttcagggaaaattt"
env.key <- "atgagagtgagagggaa"
}
if(grepl('98_Jul15',infile.wgaps))
{
pol.key <- "ttttttagggaaaatttagcc"#
env.key <- "atgagtgtgagaaggat"
}
seq <- read.dna(paste(indir.wgaps, '/', infile.wgaps, sep=''), format='fasta')
save(seq, file=paste(indir.wgaps,'/',gsub('\\.fa.*','\\.R',infile.wgaps),sep=''))
for(i in seq_len(nrow(seq)))
{
#print(i)
pol.start <- regexpr(tolower(pol.key), paste(as.character(seq[i,]), collapse=''))
if(pol.start>1)
break
}
stopifnot(pol.start>1)
for(i in seq_len(nrow(seq)))
{
#print(i)
env.start <- regexpr(tolower(env.key), paste(as.character(seq[i,]), collapse=''))
if(env.start<4000)
env.start <- -1
if(env.start>1)
break
}
stopifnot(env.start>1)
print(c(pol.start, env.start))
# write gene partition file
tmp <- paste(c( 'DNA, gag = 1-',pol.start-1,'\n',
'DNA, pol = ',pol.start,'-',env.start-1,'\n',
'DNA, env = ',env.start,'-',ncol(seq),'\n'), collapse='' )
infile.partition <- gsub('\\.fa.*','_gene\\.txt',infile.wgaps)
cat(file=paste(indir.wgaps,'/',infile.partition,sep=''), tmp)
# write codon partition file
tmp <- paste(c( 'DNA, gagcodon1 = 1-',pol.start-1,'\\3\n',
'DNA, gagcodon2 = 2-',pol.start-1,'\\3\n',
'DNA, gagcodon3 = 3-',pol.start-1,'\\3\n',
'DNA, polcodon1 = ',pol.start,'-',env.start-1,'\\3\n',
'DNA, polcodon2 = ',pol.start+1,'-',env.start-1,'\\3\n',
'DNA, polcodon3 = ',pol.start+2,'-',env.start-1,'\\3\n',
'DNA, envcodon1 = ',env.start,'-',ncol(seq),'\\3\n',
'DNA, envcodon2 = ',env.start+1,'-',ncol(seq),'\\3\n',
'DNA, envcodon3 = ',env.start+2,'-',ncol(seq),'\\3\n'), collapse='' )
infile.partition <- gsub('\\.fa.*','_codon\\.txt',infile.wgaps)
cat(file=paste(indir.wgaps,'/',infile.partition,sep=''), tmp)
}
}
##--------------------------------------------------------------------------------------------------------
## olli 03.07.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.treecomparison.gaps.RobinsonFould.ExaML3014.byMajorityCoverageThreshold<- function()
{
#
# compare ExaML trees to true topology
#
require(phangorn)
indir.wgaps <- '/Users/Oliver/git/HPTN071sim/treecomparison/withgapstrees_150701'
indir.nogaps <- '/Users/Oliver/git/HPTN071sim/treecomparison/nogaps_150630'
files <- data.table(SIMFILE=list.files(indir.wgaps, pattern='ExaML_result.*finaltree\\.[0-9]+', recursive=TRUE))
files[, TRUEFILE:= gsub('SIMULATED.*','DATEDTREE_lrgstclu.newick',gsub('ExaML_result\\.','',basename(SIMFILE)))]
set(files, NULL, 'TRUEFILE', files[, gsub('Vill_99_Apr15.*','Vill_99_Apr15.newick',TRUEFILE)])
files[, CNFG:= regmatches(basename(SIMFILE), regexpr('BWC[0-9]+|UGC[0-9]+',basename(SIMFILE)))]
files[, PARTITION:= 'None']
set(files, files[, which(!grepl('NOQ', basename(SIMFILE)))], 'PARTITION', '9ByGeneCodon')
files[, EXCLSEQ:= 'None']
tmp <- files[, which(grepl('CUT[0-9]+', basename(SIMFILE)))]
set(files, tmp, 'EXCLSEQ', files[tmp, regmatches(basename(SIMFILE), regexpr('CUT[0-9]+',basename(SIMFILE)))])
set(files, tmp, 'EXCLSEQ', files[tmp, substr(EXCLSEQ,4,nchar(EXCLSEQ))])
files[, DT:= regmatches(basename(SIMFILE), regexpr('TRAIN[0-9]+|Vill_99',basename(SIMFILE)))]
files[, REP:= as.numeric(regmatches(basename(SIMFILE), regexpr('[0-9]+$',basename(SIMFILE))))]
files[, CVRG:= as.numeric(regmatches(CNFG, regexpr('[0-9]+',CNFG)))]
files[, SITE:= regmatches(CNFG, regexpr('BW|UG',CNFG))]
#filess <- subset(files, EXCLSEQ!='None')
#robinson fould distance
#SIMFILE <- subset(filess, 'BWC5'==CNFG & EXCLSEQ=='80')[, SIMFILE][1]
#TRUEFILE <- subset(filess, 'BWC5'==CNFG & EXCLSEQ=='80')[, TRUEFILE][1]
drf <- files[,{
#print(SIMFILE)
stree <- unroot(read.tree(paste(indir.wgaps,'/',SIMFILE,sep='')))
otree <- unroot(read.tree(paste(indir.nogaps,'/',TRUEFILE,sep='')))
if(!is.binary.tree(stree))
stree <- multi2di(stree)
z <- setdiff(otree$tip.label, stree$tip.label)
if(length(z))
otree <- unroot(drop.tip(otree, z))
rf <- RF.dist(otree, stree, check.labels = TRUE)
list(RF=rf, NRF=rf/(2*Ntip(otree)-6))
}, by='SIMFILE']
drf <- merge(files, drf, by='SIMFILE')
#plot RF distance
ggplot(drf, aes(x=factor(CVRG), y=NRF*100, colour=SITE)) +
geom_point(colour='grey50') + geom_boxplot() +
scale_x_discrete(expand=c(0.01,0.01)) +
scale_y_continuous(breaks=seq(0,100,20),limits=c(0,100),expand=c(0,0)) +
labs(x='majority coverage\n(minimum number of aligned short read calls that agree)', y='normalized RF distance btw est and true trees\n(RF/(2*taxa-6)') +
facet_grid(~DT) +
theme_bw() + theme(panel.margin.x = unit(0.8, "lines"))
ggsave(file= paste(indir.wgaps,'/150630_ExaML_ExaML3014_MJCT_box.pdf',sep=''), w=5, h=4)
}
##--------------------------------------------------------------------------------------------------------
## olli 30.06.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.treecomparison.gaps.RobinsonFould.ByCut<- function()
{
#
# compare ExaML trees to true topology
#
require(phangorn)
indir.wgaps <- '/Users/Oliver/git/HPTN071sim/treecomparison/withgapstrees_150630'
indir.nogaps <- '/Users/Oliver/git/HPTN071sim/treecomparison/nogaps'
files <- data.table(SIMFILE=list.files(indir.wgaps, pattern='ExaML_result.*finaltree\\.[0-9]+', recursive=TRUE))
files[, TRUEFILE:= gsub('SIMULATED.*','DATEDTREE_lrgstclu.newick',gsub('ExaML_result\\.','',basename(SIMFILE)))]
set(files, NULL, 'TRUEFILE', files[, gsub('Vill_99_Apr15.*','Vill_99_Apr15.newick',TRUEFILE)])
files[, CNFG:= regmatches(basename(SIMFILE), regexpr('BWC[0-9]+|UGC[0-9]+',basename(SIMFILE)))]
files[, PARTITION:= 'None']
set(files, files[, which(!grepl('NOQ', basename(SIMFILE)))], 'PARTITION', '9ByGeneCodon')
files[, EXCLSEQ:= 'None']
tmp <- files[, which(grepl('CUT[0-9]+', basename(SIMFILE)))]
set(files, tmp, 'EXCLSEQ', files[tmp, regmatches(basename(SIMFILE), regexpr('CUT[0-9]+',basename(SIMFILE)))])
set(files, tmp, 'EXCLSEQ', files[tmp, substr(EXCLSEQ,4,nchar(EXCLSEQ))])
files[, DT:= regmatches(basename(SIMFILE), regexpr('TRAIN[0-9]+|Vill_99',basename(SIMFILE)))]
files[, REP:= as.numeric(regmatches(basename(SIMFILE), regexpr('[0-9]+$',basename(SIMFILE))))]
files[, CVRG:= as.numeric(regmatches(CNFG, regexpr('[0-9]+',CNFG)))]
files[, SITE:= regmatches(CNFG, regexpr('BW|UG',CNFG))]
filess <- subset(files, EXCLSEQ!='None')
#robinson fould distance
#SIMFILE <- subset(filess, 'BWC5'==CNFG & EXCLSEQ=='80')[, SIMFILE][1]
#TRUEFILE <- subset(filess, 'BWC5'==CNFG & EXCLSEQ=='80')[, TRUEFILE][1]
drf <- filess[,{
#print(SIMFILE)
stree <- unroot(read.tree(paste(indir.wgaps,'/',SIMFILE,sep='')))
otree <- unroot(read.tree(paste(indir.nogaps,'/',TRUEFILE,sep='')))
if(!is.binary.tree(stree))
stree <- multi2di(stree)
z <- setdiff(otree$tip.label, stree$tip.label)
if(length(z))
otree <- unroot(drop.tip(otree, z))
#https://groups.google.com/forum/#!topic/raxml/JgvxgknTeqw
#normalize with 2n-6
rf <- RF.dist(otree, stree, check.labels = TRUE)
list(RF=rf, NRF=rf/(2*Ntip(otree)-6))
}, by='SIMFILE']
drf <- merge(filess, drf, by='SIMFILE')
#plot RF distance
ggplot(drf, aes(x=factor(EXCLSEQ), y=NRF*100, colour=SITE)) +
geom_point(colour='grey50') + geom_boxplot() +
scale_x_discrete(expand=c(0.01,0.01)) +
scale_y_continuous(breaks=seq(0,100,20),limits=c(0,100),expand=c(0,0)) +
labs(x='taxa excluded if proportion of ? sites exceeds threshold\n(threshold value)', y='normalized RF distance btw est and true trees\n(RF/(2*taxa-6)') +
facet_grid(~DT) +
theme_bw() + theme(panel.margin.x = unit(0.8, "lines"))
ggsave(file= paste(indir.wgaps,'/150630_ExaML_TaxaExcluded_RF_box.pdf',sep=''), w=5, h=4)
}
##--------------------------------------------------------------------------------------------------------
## olli 12.02.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Feb2015.final<- function()
{
if(0)
{
indir <- '/Users/Oliver/git/HPTN071sim/source/PANGEA.HIV.sim/inst/misc'
pipeline.args <- sim.regional.args( yr.start=1985, yr.end=NA, seed=NA, s.MODEL='Fixed2Prop', report.prop.recent=1.0,
s.PREV.max.n=NA, s.INTERVENTION.prop=NA, s.INTERVENTION.start=2015, s.INTERVENTION.mul= NA, s.ARCHIVAL.n=50,
epi.model='HPTN071', epi.dt=1/48, epi.import=NA, root.edge.fixed=0,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.00447743)-0.5^2/2, wher.sigma=0.5, bwerm.mu=log(0.002239075)-0.3^2/2, bwerm.sigma=0.3, er.gamma=4,
dbg.GTRparam=0, dbg.rER=0, index.starttime.mode='fix1970', startseq.mode='one', seqtime.mode='AtDiag')
# proposed standard run and control simulation
pipeline.vary <- data.table( label= c('-sq','-ph','-tr20','-s2x','-mFP85','-y3', '-mFP25'),
yr.end= c(2020, 2020, 2020, 2020, 2020, 2018, 2020),
epi.import= c(0.05, 0.05, 0.2, 0.05, 0.05, 0.05, 0.05),
s.PREV.max.n= c(1600, 1600, 1600, 3200, 1600, 1280, 1600),
s.INTERVENTION.prop= c(0.5, 0.5, 0.5, 0.5, 0.85, 0.375, 0.25),
seed= c(42, 5, 7, 11, 13, 17, 42)
)
pipeline.vary <- pipeline.vary[7,]
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='yr.end' ), 'v', as.character(yr.end))
set(pipeline.args, which( pipeline.args$stat=='epi.import' ), 'v', as.character(epi.import))
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max.n' ), 'v', as.character(s.PREV.max.n))
set(pipeline.args, which( pipeline.args$stat=='s.INTERVENTION.prop' ), 'v', as.character(s.INTERVENTION.prop))
set(pipeline.args, which( pipeline.args$stat=='s.seed' ), 'v', as.character(seed))
print(pipeline.args)
if(1)
{
# scenario A
infile.ind <- '150129_HPTN071_scA'
infile.trm <- '150129_HPTN071_scA'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp150208-A'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
stop()
}
if(1)
{
# scenario B
infile.ind <- '150129_HPTN071_scB'
infile.trm <- '150129_HPTN071_scB'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp150208-B'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
if(1)
{
# scenario C
infile.ind <- '150129_HPTN071_scC'
infile.trm <- '150129_HPTN071_scC'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp150208-C'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
if(1)
{
# scenario D
infile.ind <- '150129_HPTN071_scD'
infile.trm <- '150129_HPTN071_scD'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp150208-D'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
if(1)
{
# scenario E
infile.ind <- '150129_HPTN071_scE'
infile.trm <- '150129_HPTN071_scE'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp150208-E'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
if(1)
{
# scenario F
infile.ind <- '150129_HPTN071_scF'
infile.trm <- '150129_HPTN071_scF'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp150208-F'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
}, by='label']
}
}
##--------------------------------------------------------------------------------------------------------
## runs for Manon's clustering analysis
## olli 31.03.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Apr2015.Manon<- function()
{
if(0)
{
indir <- '/Users/Oliver/git/HPTN071sim/source/PANGEA.HIV.sim/inst/misc'
pipeline.args <- sim.regional.args( yr.start=1985, yr.end=2020, seed=42, s.MODEL='Prop2SuggestedSampling', report.prop.recent=1.0,
s.PREV.max=NA, s.INTERVENTION.start=2015,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.2, root.edge.fixed=0,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.00447743)-0.5^2/2, wher.sigma=0.5, bwerm.mu=log(0.002239075)-0.3^2/2, bwerm.sigma=0.3, er.gamma=4,
dbg.GTRparam=0, dbg.rER=0, index.starttime.mode='fix1970', startseq.mode='one', seqtime.mode=NA)
# proposed standard run and control simulation
pipeline.vary <- data.table( label= c('-f70s3','-f70s6','-f80s3','-f80s6','-f80s3f80','-f80s6f80','-f80s3f60','-f80s6f60','-f80s3f40','-f80s6f40','-f80s3f20','-f80s6f20'),
seqtime.mode= c('AtYear3', 'AtYear6', 'AtYear3', 'AtYear6', 'AtYear3', 'AtYear6', 'AtYear3', 'AtYear6', 'AtYear3', 'AtYear6', 'AtYear3', 'AtYear6'),
s.PREV.max= c(1.0, 1.0, 1.0, 1.0, 0.8, 0.8, 0.6, 0.6, 0.4, 0.4, 0.2, 0.2),
index.starttime.mode= c('fix1970', 'fix1970', 'fix1980', 'fix1980', 'fix1980', 'fix1980', 'fix1980', 'fix1980', 'fix1980', 'fix1980', 'fix1980', 'fix1980')
)
pipeline.vary <- pipeline.vary[5:12,]
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='index.starttime.mode' ), 'v', as.character(index.starttime.mode))
set(pipeline.args, which( pipeline.args$stat=='seqtime.mode' ), 'v', as.character(seqtime.mode))
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max' ), 'v', as.character(s.PREV.max))
print(pipeline.args)
if(1)
{
# scenario E
infile.ind <- '150129_HPTN071_scE'
infile.trm <- '150129_HPTN071_scE'
outfile.ind <- '150407_Regional4Manon'
outfile.trm <- '150407_Regional4Manon'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp150401-E'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',outfile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',outfile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(outfile.ind,label,'_IND.csv',sep=''), paste(outfile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
}, by='label']
}
}
project.PANGEA.TEST.pipeline.June2016.Josh<- function()
{
library(PANGEA.HIV.sim)
pipeline.args <- sim.regional.args( seed=43,
s.MODEL='Prop2SuggestedSampling',
report.prop.recent=1.0,
epi.acute='high', epi.intervention='none',
epi.import=0.05, root.edge.fixed=0,
wher.mu=log(0.00447743)-0.5^2/2, wher.sigma=0.5, bwerm.mu=log(0.002239075)-0.3^2/2, bwerm.sigma=0.3, er.gamma=4,
dbg.GTRparam=0, dbg.rER=0,
index.starttime.mode='fix1970', startseq.mode='one', seqtime.mode="AtDiag")
# proposed standard run and control simulation
pipeline.vary <- data.table( label= paste('-s',100*c(0.05,0.1,0.2,0.3,0.5,0.6,0.7,0.8),'-seed',43,sep=''),
s.PREV.max= c(0.05,0.1,0.2,0.3,0.5,0.6,0.7,0.8))
dummy <- pipeline.vary[1, {
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max' ), 'v', as.character(s.PREV.max))
set(pipeline.args, which( pipeline.args$stat=='s.seed' ), 'v', as.character(43))
print(pipeline.args)
tmpdir <- '/Users/Oliver/duke/tmp/PANGEA-AcuteHigh-InterventionNone'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
file <- sim.regional(tmpdir, pipeline.args=pipeline.args)
#system(file)
}, by='label']
# produce 10 replicates for each sequence coverage level
# start with 40% which should correspond to 22.4%
pipeline.vary <- data.table( s.PREV.max=c(0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8),
cov=c(0.028,0.058,0.118,0.17,0.224,0.292,0.348,0.413,0.468))
pipeline.vary <- merge(pipeline.vary, as.data.table(expand.grid(s.PREV.max=c(0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8), seed=seq(43, length.out=9))), by='s.PREV.max')
pipeline.vary[, label:= paste('-cov',100*cov,'-seed',seed,sep='')]
pipeline.vary <- subset(pipeline.vary, s.PREV.max!=0.4 | seed!=43)
pipeline.vary <- subset(pipeline.vary, cov>0.4)
pipeline.vary <- subset(pipeline.vary, seed!=43)
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max' ), 'v', as.character(s.PREV.max))
set(pipeline.args, which( pipeline.args$stat=='s.seed' ), 'v', as.character(seed))
print(pipeline.args)
tmpdir <- '/Users/Oliver/duke/tmp/PANGEA-AcuteHigh-InterventionNone'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
file <- sim.regional(tmpdir, pipeline.args=pipeline.args)
system(file)
#
}, by='label']
#
# new sampling model
pipeline.args <- sim.regional.args( seed=43,
s.MODEL='Prop2SuggestedSampling',
seqtime.mode="Weibull_2.8_4",
report.prop.recent=1.0,
epi.acute='high', epi.intervention='none',
epi.import=0.05, root.edge.fixed=0,
wher.mu=log(0.00447743)-0.5^2/2, wher.sigma=0.5, bwerm.mu=log(0.002239075)-0.3^2/2, bwerm.sigma=0.3, er.gamma=4,
dbg.GTRparam=0, dbg.rER=0,
index.starttime.mode='fix1970', startseq.mode='one')
pipeline.vary <- data.table( s.PREV.max=c(0.8),
cov=c(0.65))
pipeline.vary <- merge(pipeline.vary, as.data.table(expand.grid(s.PREV.max=c(0.8), seed=seq(43, length.out=10))), by='s.PREV.max')
pipeline.vary[, label:= paste('-cov',100*cov,'-seed',seed,sep='')]
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max' ), 'v', as.character(s.PREV.max))
set(pipeline.args, which( pipeline.args$stat=='s.seed' ), 'v', as.character(seed))
print(pipeline.args)
tmpdir <- '/Users/Oliver/duke/tmp/PANGEA-AcuteHigh-InterventionNone'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
file <- sim.regional(tmpdir, pipeline.args=pipeline.args)
#system(file)
#
}, by='label']
}
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Apr2015.Manon.postprocess<- function()
{
indir <- '/Users/Oliver/git/HPTN071sim'
files <- list.files(path=indir, pattern='*INTERNAL.R$', recursive='TRUE')
files <- files[ grepl('150401', files)]
for(file in files)
{
load( paste(indir, file, sep='/') )
df.trms <- subset(df.trms, select=c(IDREC, IDTR, TIME_TR, SAMPLED_REC, SAMPLED_TR, IDCLU))
df.inds <- subset(df.inds, select=c(IDPOP, TIME_TR, GENDER, DOB, DOD, DIAG_T, TIME_SEQ, IDCLU))
tmp <- gsub('SIMULATED_INTERNAL','SIM',file)
save(df.trms, df.inds, df.seq, file= paste(indir, tmp, sep='/'))
}
}
##--------------------------------------------------------------------------------------------------------
## check simulated sequences: create ExaML tree and estimate R2
## olli 08.05.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Feb2015.plotsim<- function()
{
# get randomization names
set.seed(42)
indir <- '/Users/Oliver/duke/2014_Gates/methods_comparison_pipeline/FINAL'
dfi <- data.table(FILE=list.files(indir, '.*zip$', full.names=FALSE))
dfi[, SC:= sapply(strsplit(FILE, '_'),'[[',3)]
dfi[, CONFIG:= sapply(strsplit(SC, '-'),'[[',2)]
set(dfi, NULL, 'SC', dfi[, sapply(strsplit(SC, '-'),'[[',1)])
dfi[, DATAT:= sapply(strsplit(FILE, '_'),'[[',5)]
set(dfi, NULL, 'DATAT', dfi[, gsub('.zip','',DATAT,fixed=T)])
set(dfi, NULL, 'OBJECTIVE', 'SecondObj')
set(dfi, dfi[,which(CONFIG=='sq')],'OBJECTIVE', 'FirstObj')
dfi <- merge(dfi,dfi[, list(FILE=FILE, DUMMY=sample(length(FILE),length(FILE))), by='OBJECTIVE'],by=c('OBJECTIVE','FILE'))
tmp <- dfi[, which(OBJECTIVE=='SecondObj')]
set(dfi, tmp, 'DUMMY', dfi[tmp, DUMMY] + dfi[OBJECTIVE=='FirstObj', max(DUMMY)])
setkey(dfi, DUMMY)
dfi[, SC_RND:= toupper(letters[seq_len(nrow(dfi))])]
dfi[, GSUB_FROM:= sapply(strsplit(FILE, '_'),'[[',3)]
dfi[, GSUB_TO:= paste(OBJECTIVE,'_sc',SC_RND,sep='')]
# read df.epi for true simulations
indir <- '/Users/Oliver/Dropbox\ (Infectious Disease)/PANGEAHIVsim_internal/freeze_Jan15/regional/150208'
infiles <- data.table(FILE=list.files(indir, '.*INTERNAL.R$', full.names=FALSE, recursive=TRUE))
infiles[, BASENAME:= basename(FILE)]
infiles[, SC:= sapply(strsplit(BASENAME, '_'),'[[',3)]
infiles[, CONFIG:= sapply(strsplit(SC, '-'),'[[',2)]
set(infiles, NULL, 'SC', infiles[, sapply(strsplit(SC, '-'),'[[',1)])
infiles <- merge(infiles, subset(dfi, select=c(SC, CONFIG, DATAT, SC_RND, GSUB_FROM, GSUB_TO)), by=c('SC','CONFIG'))
set(infiles, NULL, 'BASENAME', NULL)
infiles <- subset(infiles, CONFIG=='sq' | SC%in%c('scE','scF'))
df.epi <- do.call('rbind',lapply(seq_len(nrow(infiles)), function(i)
{
file <- paste(indir, '/', infiles[i, FILE], sep='')
cat(paste('\nread file', infiles[i, FILE]))
load(file)
epi.adult <- 13
tp <- 1/12
suppressWarnings( df.trms[, MNTH:= df.trms[, floor(TIME_TR) + floor( (TIME_TR%%1)*100 %/% (tp*100) ) * tp]] )
df.epi <- df.trms[, list(INC=length(IDREC), INC_ACUTE=length(which(TR_ACUTE=='Yes')),IMPORT=length(which(IDTR<0))), by='MNTH']
setkey(df.epi, MNTH)
tmp <- df.epi[, {
sexactive <- which( (df.inds[['DOB']]+epi.adult-tp/2)<=MNTH & (df.inds[['DOD']]+tp/2)>MNTH )
infected.ever <- which( (df.inds[['TIME_TR']]-tp/2)<=MNTH )
infected <- which( (df.inds[['DOB']]-tp/2)<=MNTH & (df.inds[['DOD']]-tp/2)>MNTH & (df.inds[['TIME_TR']]-tp/2)<=MNTH )
diag <- which( (df.inds[['DOB']]-tp/2)<=MNTH & (df.inds[['DOD']]-tp/2)>MNTH & (df.inds[['DIAG_T']]-tp/2)<=MNTH )
diag.new <- which( (df.inds[['DIAG_T']]-tp/2)==MNTH )
treated <- which( (df.inds[['DOB']]-tp/2)<=MNTH & (df.inds[['DOD']]-tp/2)>MNTH & (df.inds[['ART1_T']]-tp/2)<=MNTH & (is.na(df.inds[['VLS1_TE']]) | (df.inds[['VLS1_TE']]-tp/2)>MNTH) )
infdead <- which( (df.inds[['DOD']]-tp/2)==MNTH & (df.inds[['TIME_TR']]-tp/2)<=MNTH )
sampled <- which( (df.inds[['DOB']]-tp/2)<=MNTH & (df.inds[['DOD']]-tp/2)>MNTH & (df.inds[['TIME_SEQ']]-tp/2)<=MNTH )
list(POP=length(sexactive), PREV=length(infected), PREV_EVER=length(infected.ever), PREVDIED=length(infdead), DIAG=length(diag), NEW_DIAG=length(diag.new), TREATED=length(treated), SEQ=length(sampled))
},by='MNTH']
df.epi <- merge( tmp, df.epi, by='MNTH' )
set(df.epi, NULL, 'PREVp', df.epi[, PREV/(POP-PREV)])
set(df.epi, NULL, 'SEQp', df.epi[, SEQ/PREV])
set(df.epi, NULL, 'ARTcov', df.epi[, TREATED/PREV])
set(df.epi, NULL, 'UNDIAGp', df.epi[, (PREV-DIAG)/PREV])
set(df.epi, NULL, 'GROWTHr', c(NA_real_, df.epi[, diff(log(PREV))]))
df.epi[, DUMMY:= seq_len(nrow(df.epi))]
tmp <- df.epi[, {
tmp <- seq.int(DUMMY-6,DUMMY+5)
tmp[tmp<1] <- 1
tmp[tmp>nrow(df.epi)] <- nrow(df.epi)
list( INCp= sum(df.epi[['INC']][tmp]) / mean(df.epi[['POP']][tmp]-df.epi[['PREV']][tmp]),
IMPORTp= sum(df.epi[['IMPORT']][tmp]) / sum(df.epi[['INC']][tmp]),
ACUTEp= sum(df.epi[['INC_ACUTE']][tmp]) / sum(df.epi[['INC']][tmp])
)
}, by='DUMMY']
df.epi <- merge(df.epi, tmp, by='DUMMY')
df.epi[, SC:= infiles[i, substr(SC,3,3)]]
df.epi[, DUMMY:=NULL]
df.epi
}))
# plot prevalence
df.epi <- melt(df.epi, id.vars=c('SC', 'MNTH'))
df <- subset(df.epi, variable%in%c('POP','PREV','INC','PREVp','INCp','SEQp','ARTcov','UNDIAGp','ACUTEp','IMPORTp') & MNTH>=1985)
tmp <- data.table( variable=c('POP','PREV','INC','PREVp','INCp','SEQp','ARTcov','UNDIAGp','ACUTEp','IMPORTp'),
legend=c('adult\npopulation', 'HIV infected', 'new cases\n(month)','Prevalence\n(%)','%Incidence\n(year)','Sequence\ncoverage\n(%)','ART\ncoverage\n(%)','Undiagnosed\n(%)','% transm\nfrom < 3m\n(yr)','% transm\nfrom outside\n(yr)'))
set(tmp, NULL, 'legend', tmp[, factor(legend, levels=tmp$legend, labels=tmp$legend)])
df <- merge(df, tmp, by='variable')
tmp <- df[, which(!variable%in%c('POP','PREV','INC'))]
set(df, tmp,'value', df[tmp, value*100])
ggplot(df, aes(x=MNTH, y=value, group=SC, colour=SC)) + geom_line() +
scale_colour_brewer(palette='Set1') +
facet_wrap(~legend, scales='free',ncol=3) +
theme_bw() + labs(x='', y='',colour='epidemic scenario', title='Regional\n(since 1985)\n') +
theme(legend.position='bottom')
outdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim_internal/documents/external/2015_05_results/results'
ggsave(file=paste(outdir,'/regional_sincestart.pdf',sep=''), w=10, h=10)
df <- subset(df.epi, variable%in%c('POP','PREV','INC','PREVp','INCp','SEQp','ARTcov','UNDIAGp','ACUTEp','IMPORTp') & MNTH>=2014.5)
tmp <- data.table( variable=c('POP','PREV','INC','PREVp','INCp','SEQp','ARTcov','UNDIAGp','ACUTEp','IMPORTp'),
legend=c('adult\npopulation', 'HIV infected', 'new cases\n(month)','Prevalence\n(%)','%Incidence\n(year)','Sequence\ncoverage\n(%)','ART\ncoverage\n(%)','Undiagnosed\n(%)','% transm\nfrom < 3m\n(yr)','% transm\nfrom outside\n(yr)'))
set(tmp, NULL, 'legend', tmp[, factor(legend, levels=tmp$legend, labels=tmp$legend)])
df <- merge(df, tmp, by='variable')
tmp <- df[, which(!variable%in%c('POP','PREV','INC'))]
set(df, tmp,'value', df[tmp, value*100])
ggplot(df, aes(group=SC, colour=SC)) +
geom_rect(data=NULL, aes(xmin=2015, xmax=2018, ymin=-Inf, ymax=Inf), fill='grey80', colour='transparent', alpha=0.2) +
geom_line(aes(x=MNTH, y=value)) +
scale_colour_brewer(palette='Set1') +
facet_wrap(~legend, scales='free',ncol=3) +
theme_bw() + labs(x='', y='',colour='epidemic scenario', title='Regional\n(since start of intervention)\n') +
theme(legend.position='bottom')
outdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim_internal/documents/external/2015_05_results/results'
ggsave(file=paste(outdir,'/regional_sinceintervention.pdf',sep=''), w=10, h=10)
}
##--------------------------------------------------------------------------------------------------------
## check simulated sequences: create ExaML tree and estimate R2
## olli 08.05.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Feb2015.resultsforsim<- function()
{
# get randomization names
set.seed(42)
indir <- '/Users/Oliver/duke/2014_Gates/methods_comparison_pipeline/FINAL'
dfi <- data.table(FILE=list.files(indir, '.*zip$', full.names=FALSE))
dfi[, SC:= sapply(strsplit(FILE, '_'),'[[',3)]
dfi[, CONFIG:= sapply(strsplit(SC, '-'),'[[',2)]
set(dfi, NULL, 'SC', dfi[, sapply(strsplit(SC, '-'),'[[',1)])
dfi[, DATAT:= sapply(strsplit(FILE, '_'),'[[',5)]
set(dfi, NULL, 'DATAT', dfi[, gsub('.zip','',DATAT,fixed=T)])
set(dfi, NULL, 'OBJECTIVE', 'SecondObj')
set(dfi, dfi[,which(CONFIG=='sq')],'OBJECTIVE', 'FirstObj')
dfi <- merge(dfi,dfi[, list(FILE=FILE, DUMMY=sample(length(FILE),length(FILE))), by='OBJECTIVE'],by=c('OBJECTIVE','FILE'))
tmp <- dfi[, which(OBJECTIVE=='SecondObj')]
set(dfi, tmp, 'DUMMY', dfi[tmp, DUMMY] + dfi[OBJECTIVE=='FirstObj', max(DUMMY)])
setkey(dfi, DUMMY)
dfi[, SC_RND:= toupper(letters[seq_len(nrow(dfi))])]
dfi[, GSUB_FROM:= sapply(strsplit(FILE, '_'),'[[',3)]
dfi[, GSUB_TO:= paste(OBJECTIVE,'_sc',SC_RND,sep='')]
# read df.epi for true simulations
indir <- '/Users/Oliver/Dropbox\ (Infectious Disease)/PANGEAHIVsim_internal/freeze_Jan15/regional/150208'
infiles <- data.table(FILE=list.files(indir, '.*INTERNAL.R$', full.names=FALSE, recursive=TRUE))
infiles[, BASENAME:= basename(FILE)]
infiles[, SC:= sapply(strsplit(BASENAME, '_'),'[[',3)]
infiles[, CONFIG:= sapply(strsplit(SC, '-'),'[[',2)]
set(infiles, NULL, 'SC', infiles[, sapply(strsplit(SC, '-'),'[[',1)])
infiles <- merge(infiles, subset(dfi, select=c(SC, CONFIG, DATAT, SC_RND, GSUB_FROM, GSUB_TO)), by=c('SC','CONFIG'))
set(infiles, NULL, 'BASENAME', NULL)
df.epi <- do.call('rbind',lapply(seq_len(nrow(infiles)), function(i)
{
file <- paste(indir, '/', infiles[i, FILE], sep='')
cat(paste('\nread file', infiles[i, FILE]))
load(file)
tmp <- cbind( df.epi, infiles[i, -which(grepl('FILE', names(infiles))), with=0] )
tmp
}))
cat(paste('\nread data for scenarios, n=',df.epi[, length(unique(SC_RND))]))
# get objectives for true simulations
# obj i: A-D decreasing, E-F stable
#subset(df.epi, YR==2019)
tmp <- data.table( SC=paste('sc',c('A','B','C','D','E','F'),sep=''), OBJ_i= c('decreasing','decreasing','decreasing','decreasing','stable','stable'))
dfo <- merge(df.epi, tmp, by='SC')
# obj ii: %incidence in last year of eval period
tmp <- unique(subset(df.epi, select=CONFIG))
tmp[, YR:=2018]
set(tmp, tmp[, which(CONFIG=='y3')], 'YR', 2016)
tmp <- subset(merge(df.epi, tmp, by=c('CONFIG','YR')), select=c(SC, CONFIG, INCp, YR))
setkey(tmp, SC)
setnames(tmp, c('INCp','YR'), c('OBJ_ii','OBJ_ii_te'))
dfo <- merge(dfo, tmp, by=c('SC','CONFIG'))
# obj iii: ratio incidence
tmp <- merge(tmp, subset(df.epi, YR==2014, select=c(SC, CONFIG, INCp)), by=c('SC','CONFIG'))
set(tmp, NULL, 'OBJ_iii', tmp[, OBJ_ii/INCp] )
dfo <- merge(dfo, subset(tmp, select=c(SC, CONFIG, OBJ_iii)), by=c('SC','CONFIG'))
# obj iv: %acute
#subset(df.epi, YR==2014)
tmp <- data.table( SC=paste('sc',c('A','B','C','D','E','F'),sep=''), OBJ_iv= c('<10%','>30%','<10%','>30%','<10%','>30%'))
dfo <- merge(dfo, tmp, by='SC')
# obj v: %acute at baseline
tmp <- subset(df.epi, YR==2014, select=c(SC, CONFIG, ACUTEp))
setkey(tmp, SC)
setnames(tmp, 'ACUTEp', 'OBJ_v')
dfo <- merge(dfo, tmp, by=c('SC','CONFIG'))
# obj vi: %acute at last yr of eval period
tmp <- unique(subset(df.epi, select=CONFIG))
tmp[, YR:=2018]
set(tmp, tmp[, which(CONFIG=='y3')], 'YR', 2016)
tmp <- subset(merge(df.epi, tmp, by=c('CONFIG','YR')), select=c(SC, CONFIG, ACUTEp, YR))
setkey(tmp, SC)
setnames(tmp, c('ACUTEp','YR'),c('OBJ_vi','OBJ_vi_te'))
dfo <- merge(dfo, tmp, by=c('SC','CONFIG'))
# write csv file
setkey(dfo, SC, CONFIG)
ans <- unique(dfo)
ans <- subset(ans, select=c(SC, CONFIG, DATAT, SC_RND, GSUB_FROM, GSUB_TO, OBJ_i, OBJ_ii, OBJ_ii_te, OBJ_iii, OBJ_iv, OBJ_v, OBJ_vi, OBJ_vi_te))
ans[, TEAM:='True']
ans[, SUBMISSION_DATE:='08.05.2015']
ans[, SIM_SCENARIO:= paste('150129_PANGEAsim_Regional_',GSUB_TO,'_SIMULATED_',DATAT,sep='')]
ans[, USED_GENES:='all']
ans[, ESTIMATE:='central']
outdir <- '/Users/Oliver/Dropbox\ (Infectious Disease)/PANGEAHIVsim_internal/documents/external/2015_05_results'
file <- paste(outdir, '/answers_Regional_Feb2015_rFormat.csv', sep='')
write.csv(subset(ans, select=c(TEAM, SUBMISSION_DATE, SIM_SCENARIO, USED_GENES, OBJ_i, OBJ_ii, OBJ_iii, OBJ_iv, OBJ_v, OBJ_vi, ESTIMATE)), file=file, row.names=FALSE)
setnames(ans, 'OBJ_ii_te','te')
ans[, tb:=2014]
file <- paste(outdir, '/Regional_Feb2015_tb_te.csv', sep='')
write.csv(subset(ans, select=c(SIM_SCENARIO, tb, te)), file=file, row.names=FALSE)
}
##--------------------------------------------------------------------------------------------------------
## evaluate results
## olli 08.05.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Feb2015.evaluate<- function()
{
require(RColorBrewer)
dfa <- project.PANGEA.TEST.pipeline.Feb2015.evaluate.read()
outdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim_internal/documents/external/2015_05_results/results'
save(dfa, file=paste(outdir,'/submissions.R',sep=''))
load(paste(outdir,'/submissions.R',sep=''))
# set answers to numerical
set(dfa, dfa[, which(OBJ%in%c('OBJ_i','OBJ_iv'))], c('lower95','upper95'), NA_character_)
set(dfa, dfa[, which(central=='decreasing')], c('central'), '-1')
set(dfa, dfa[, which(central=='stable')], c('central'), '0')
set(dfa, dfa[, which(central=='increasing')], c('central'), '1')
set(dfa, dfa[, which(central=='<15%')], c('central'), '-1')
set(dfa, dfa[, which(central=='15%-30%')], c('central'), '0')
set(dfa, dfa[, which(central=='>30%')], c('central'), '1')
set(dfa, NULL, 'central', dfa[, as.numeric(central)])
set(dfa, NULL, 'lower95', dfa[, as.numeric(lower95)])
set(dfa, NULL, 'upper95', dfa[, as.numeric(upper95)])
# add simulation type
dfa[, DATAT_L:='NA_character_']
set(dfa, dfa[, which(grepl('Vill',SIM_SCENARIO))], 'DATAT_L','Village')
set(dfa, dfa[, which(grepl('Regional',SIM_SCENARIO))], 'DATAT_L','Regional')
# add objective legend
dfa <- merge(dfa, data.table(USED_GENES=c('pol','all'), USED_GENES_L=c('pol gene','pol+gag+env\ngenome') ), by='USED_GENES')
set(dfa, NULL, 'TEAM', dfa[, factor(TEAM)])
tmp <- data.table( OBJ= c('OBJ_i','OBJ_ii','OBJ_iii','OBJ_iv','OBJ_v','OBJ_vi'),
OBJ_L= c('Incidence\nTrend', '%Incidence', 'Incidence\nreduction', '%Acute Ctgr\n(baseline)', '%Acute\n(baseline)', '%Acute\n(endpoint)'))
set(tmp, NULL, 'OBJ_L2', tmp[, factor(OBJ_L, levels=OBJ_L, labels=OBJ_L)])
set(tmp, NULL, 'OBJ_L', tmp[, factor(OBJ_L, levels=rev(OBJ_L), labels=rev(OBJ_L))])
dfa <- merge(dfa, tmp, by='OBJ')
# add data legend
dfa[, DATA_T2:='NA_character_']
set(dfa, dfa[, which(DATA_T=='seq')], 'DATA_T2', 'using\nsequences')
set(dfa, dfa[, which(DATA_T=='phy')], 'DATA_T2', 'using\ntrue tree')
set(dfa, NULL, 'DATA_T2', dfa[, factor(DATA_T2, levels=rev(c('using\nsequences','using\ntrue tree')), labels=rev(c('using\nsequences','using\ntrue tree')))])
# add scenario type
set(dfa, NULL, 'DATA_T', dfa[, factor(DATA_T, levels=c('seq','phy'), labels=c('seq','phy'))])
set(dfa, NULL, 'INT_T', dfa[, factor(INT_T, levels=c('fast','slow','none'), labels=c('fast','slow','none'))])
set(dfa, NULL, 'AC_T', dfa[, factor(AC_T, levels=c('low','high'), labels=c('low','high'))])
set(dfa, NULL, 'IMPRT', dfa[, factor(IMPRT*100, levels=c(5,20,2,0), labels=paste(c(5,20,2,0),'%',sep=''))])
set(dfa, NULL, 'SMPL_C', dfa[, factor(SMPL_C*100, levels=c(8, 16, 30, 60), labels=paste(c(8, 16, 30, 60),'%',sep=''))])
set(dfa, NULL, 'SMPL_D', dfa[, factor(SMPL_D, levels=c(5,3), labels=c(5,3))])
set(dfa, dfa[, which(SMPL_M=='overs')], 'SMPL_M', 'much')
set(dfa, dfa[, which(SMPL_M=='extrs')], 'SMPL_M', 'extreme')
set(dfa, dfa[, which(is.na(SMPL_M))], 'SMPL_M', 'extreme')
set(dfa, NULL, 'SMPL_M', dfa[, factor(SMPL_M, levels=c('much','extreme'), labels=c('much','extreme'))])
tmp <- unique(subset( dfa, select=c(DATAT_L, SC_RND, DATA_T, SC, AC_T, INT_T, IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D) ))
setkey(tmp, DATAT_L, AC_T, INT_T, DATA_T, IMPRT, SMPL_C, SMPL_D, SMPL_M)
tmp[, SCENARIO_L:= paste('%AC=',AC_T,' ARTup=',INT_T,' EXT=',IMPRT,'\n',DATA_T,' ',SMPL_N,' ',SMPL_C,' ',SMPL_D,' ',SMPL_M, ' (',SC_RND,')',sep='')]
dfa <- merge(dfa, subset(tmp, select=c(SC_RND, SCENARIO_L)), by='SC_RND')
# add intervention legend
dfa[, INT_L:= dfa[, paste('ART scale up\n',as.character(INT_T),sep='')]]
setkey(dfa, INT_T)
set(dfa, NULL, 'INT_L', dfa[, factor(INT_L, levels=dfa[, unique(INT_L)], labels=dfa[, unique(INT_L)])])
# add %Acute legend
dfa[, AC_L:= dfa[, paste('%Acute\n',as.character(AC_T),sep='')]]
setkey(dfa, AC_T)
set(dfa, NULL, 'AC_L', dfa[, factor(AC_L, levels=dfa[, unique(AC_L)], labels=dfa[, unique(AC_L)])])
# add team color
TEAM_CL <- brewer.pal(dfa[,length(unique(TEAM))], 'Paired')
names(TEAM_CL) <- dfa[, unique(TEAM)]
TEAM_CL[7] <- "#386CB0"
TEAM_CL[3] <- "#FF7F00"
# count total submissions primary vs secondary
tmp <- subset(dfa, TEAM!='True' & !grepl('(', TEAM, fixed=1))
tmp <- tmp[, list( Village=length(which(grepl('Vill',SIM_SCENARIO))), Regional=length(which(grepl('Regional',SIM_SCENARIO)))), by=c('TEAM','OBJ_L','USED_GENES_L')]
tmp <- melt(tmp, measure.vars=c('Village','Regional'))
ggplot(tmp, aes(x=OBJ_L, y=value, fill=TEAM)) + geom_bar(stat='identity') +
facet_grid(USED_GENES_L~variable) +
guides(fill=guide_legend(ncol=2)) +
scale_fill_manual(values=TEAM_CL) +
labs(x='', y='submissions\n(#)', title='Total scenarios submitted\n(using sequence data or true trees)\n') +
theme_bw()+ theme(legend.position='bottom') + coord_flip()
ggsave(file=paste(outdir,'/res_scenarios_total.pdf',sep=''), w=10, h=8)
# count all submissions for primary objectives
tmp <- subset(dfa, TEAM!='True' & !grepl('(', TEAM, fixed=1) & DATA_T=='seq')
tmp <- tmp[, list( Village=length(which(grepl('Vill',SIM_SCENARIO))), Regional=length(which(grepl('Regional',SIM_SCENARIO)))), by=c('TEAM','OBJ_L')]
tmp <- melt(tmp, measure.vars=c('Village','Regional'))
ggplot(tmp, aes(x=OBJ_L, y=value, fill=TEAM)) + geom_bar(stat='identity') +
facet_grid(~variable) +
labs(x='', y='submissions\n(#)', title='Total scenarios submitted\n(using sequence data)\n') +
scale_fill_manual(values=TEAM_CL) +
guides(fill=guide_legend(ncol=2)) +
theme_bw() + theme(legend.position='bottom') + coord_flip()
ggsave(file=paste(outdir,'/res_scenarios_total_seqonly.pdf',sep=''), w=10, h=5)
# count complete submissions for primary objectives
tmp <- subset(dfa, TEAM!='True' & !grepl('(', TEAM, fixed=1) & DATA_T=='seq')
tmp <- tmp[, list( Village=as.numeric(length(setdiff(c('01','02','03','04'),SC_RND))==0), Regional=as.numeric(length(setdiff(c('A','B','C','D'),SC))==0)), by=c('TEAM','OBJ_L','USED_GENES_L')]
tmp <- melt(tmp, measure.vars=c('Village','Regional'))
ggplot(tmp, aes(x=OBJ_L, y=value, fill=TEAM)) + geom_bar(stat='identity') +
facet_grid(USED_GENES_L~variable) +
scale_y_continuous(breaks=seq(1,10,1), minor_breaks=NULL) +
scale_fill_manual(values=TEAM_CL) +
labs(x='', y='complete set of 4 submissions\n(#)', title='Complete submissions to evalute primary objectives\n(either village or regional)') +
guides(fill=guide_legend(ncol=2)) +
theme_bw() + theme(legend.position='bottom') + coord_flip()
ggsave(file=paste(outdir,'/res_scenarios_total_seqonlycomplete.pdf',sep=''), w=10, h=7)
#
# show village / regional: Imports, Total Sequence, Sequence Coverage
#
# for each team
# all results
invisible(sapply(setdiff(dfa[, unique(TEAM)],'True'), function(x)
{
#x <- 'Imperial'
df <- subset(dfa, (TEAM=='True' | TEAM==x) & USED_GENES=='all')
set(df, df[, which(TEAM==x)], 'TEAM', 'estimate')
set(df, df[, which(TEAM=='True')], 'TEAM', 'true value')
set(df, NULL, 'TEAM', df[, factor(TEAM, levels=c('estimate','true value'), labels=c('estimate','true value'))])
ggplot(df, aes(y=SCENARIO_L, x=central, xmin=lower95, xmax=upper95, colour=TEAM, pch=TEAM)) +
geom_errorbarh(height=0.3) + geom_point(size=3) +
scale_colour_manual(values = c("red","black")) +
scale_shape_manual(values = c(13,18), guide = FALSE) +
labs(x='', y='', title= paste('TEAM',x,'\n'), colour='') +
facet_grid(DATAT_L~OBJ_L2, scales='free', space='free_y') +
theme_bw() + theme(legend.position='bottom')
ggsave(file=paste(outdir,'/res_obj_TEAM_',gsub(' ','_',gsub('\\/|\\(|\\)','',x)),'.pdf',sep=''), w=14, h=0.5*df[, length(unique(SCENARIO_L))])
# results using seq data
df <- subset(dfa, (TEAM=='True' | TEAM==x) & USED_GENES=='all' & DATA_T=='seq')
set(df, df[, which(TEAM==x)], 'TEAM', 'estimate')
set(df, df[, which(TEAM=='True')], 'TEAM', 'true value')
set(df, NULL, 'TEAM', df[, factor(TEAM, levels=c('estimate','true value'), labels=c('estimate','true value'))])
ggplot(df, aes(y=SCENARIO_L, x=central, xmin=lower95, xmax=upper95, colour=TEAM, pch=TEAM)) +
geom_errorbarh(height=0.3) + geom_point(size=3) +
scale_colour_manual(values = c("red","black")) +
scale_shape_manual(values = c(13,18), guide = FALSE) +
labs(x='', y='', title= paste('TEAM',x,'\n'), colour='') +
facet_grid(DATAT_L~OBJ_L2, scales='free', space='free_y') +
theme_bw() + theme(legend.position='bottom')
ggsave(file=paste(outdir,'/res_objonseq_TEAM_',gsub(' ','_',gsub('\\/|\\(|\\)','',x)),'.pdf',sep=''), w=14, h=0.7*df[, length(unique(SCENARIO_L))])
}))
# for each objective
# compare results across teams
require("grid")
# compare objectives with / without seq data, village + regional
df <- subset(dfa, DATA_T=='seq')
# regional, trees corresponding to seq data sets
tmp <- subset(dfa, DATA_T=='phy' & SMPL_D=='5' & SMPL_M=='much' & SMPL_C=='8%' & SMPL_N==1600 & IMPRT=='5%')
df <- rbind(df, tmp)
# village, trees corresponding to seq data sets
tmp <- subset(dfa, SC_RND%in%c('11','09','12','10','00'))
df <- rbind(df, tmp)
tmp <- subset(df, OBJ%in%c('OBJ_ii','OBJ_iii','OBJ_v','OBJ_vi') & !grepl('(',TEAM,fixed=1) & DATAT_L=='Regional' & USED_GENES=='all')
tmp2<- as.data.table(expand.grid(central=c(0.01,0.55), AC_L=c('%Acute\nlow','%Acute\nhigh'), INT_L=c('ART scale up\nfast','ART scale up\nslow','ART scale up\nnone'), TEAM='dummy', DATA_T2='using\ntrue tree', OBJ_L2=c('%Acute\n(baseline)','%Acute\n(endpoint)')))
set(tmp2, NULL, 'INT_L', tmp2[, factor(INT_L, levels=c('ART scale up\nfast','ART scale up\nslow','ART scale up\nnone'), labels=c('ART scale up\nfast','ART scale up\nslow','ART scale up\nnone'))])
setnames(tmp2, 'TEAM','team')
tmp3 <- subset(tmp, TEAM=='True')
setnames(tmp3, 'TEAM','team')
ggplot(subset(tmp, TEAM!='True'), aes(y=INT_L, x=central)) +
geom_point(data=tmp2, size=1, colour='transparent') +
geom_errorbarh(aes(xmin=lower95, xmax=upper95, colour=TEAM), height=0.3) +
geom_point(size=4, aes(colour=TEAM), pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(TEAM+DATA_T2~OBJ_L2+AC_L, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(0.5, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=2)) +
labs(x= '\nEstimates', y='', title='Primary objectives: quantitative\n(Regional)\n')
ggsave(file=paste(outdir,'/res_acrossTEAM_primaryregional_nmbrs','.pdf',sep=''), w=13, h=10)
# qualitative
tmp <- subset(df, OBJ%in%c('OBJ_i','OBJ_iv') & !grepl('(',TEAM,fixed=1) & DATAT_L=='Regional' & USED_GENES=='all')
set(tmp, NULL, 'central', tmp[, as.character(central)])
set(tmp, tmp[, which(OBJ=='OBJ_i' & central=='-1')], 'central', 'declining')
set(tmp, tmp[, which(OBJ=='OBJ_i' & central=='0')], 'central', 'stable')
set(tmp, tmp[, which(OBJ=='OBJ_i' & central=='1')], 'central', 'increasing')
set(tmp, tmp[, which(OBJ=='OBJ_iv' & central=='-1')], 'central', '<15%')
set(tmp, tmp[, which(OBJ=='OBJ_iv' & central=='0')], 'central', '15%-30%')
set(tmp, tmp[, which(OBJ=='OBJ_iv' & central=='1')], 'central', '>30%')
tmp2 <- c('declining','stable','increasing','<15%','15%-30%','>30%')
set(tmp, NULL, 'central', tmp[, factor(central, levels=tmp2, labels=tmp2)])
tmp2 <- as.data.table(expand.grid(central=tmp2, AC_T=c('low','high'), INT_L=c('ART scale up\nfast','ART scale up\nslow','ART scale up\nnone'), TEAM='dummy', DATA_T2='using\ntrue tree', OBJ_L2='Incidence\nTrend'))
set(tmp2, tmp2[, which(grepl('%', central))], 'OBJ_L2', '%Acute Ctgr\n(baseline)')
set(tmp2, NULL, 'INT_L', tmp2[, factor(INT_L, levels=c('ART scale up\nfast','ART scale up\nslow','ART scale up\nnone'), labels=c('ART scale up\nfast','ART scale up\nslow','ART scale up\nnone'))])
setkey(tmp2, central)
ggplot(tmp2, aes(y=INT_L, x=central, colour=TEAM)) +
#geom_errorbarh(height=0.3) +
geom_point(data=tmp2, size=1, colour='transparent') +
geom_jitter(data=subset(tmp, TEAM!='True'), size=3, pch=13, position = position_jitter(width=0, height=.15)) +
geom_point(data=subset(tmp, TEAM=='True'), size=3, colour='black', pch=18) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(DATA_T2~OBJ_L2+AC_L, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(0.5, "lines"), legend.position='bottom', panel.grid.minor= element_blank(), panel.grid.major= element_blank()) +
guides(colour=guide_legend(ncol=2)) +
labs(x= '\nEstimates', y='', title='Primary objectives: qualitatitve\n(Regional)\n')
ggsave(file=paste(outdir,'/res_acrossTEAM_primaryregional_qultv','.pdf',sep=''), w=13, h=6)
#invisible(lapply(c('OBJ_ii','OBJ_iii','OBJ_v','OBJ_vi'), function(x)
# {
# #x <- 'OBJ_ii'
# tmp <- subset(df, OBJ==x & !grepl('(',TEAM,fixed=1) & DATAT_L=='Regional' & USED_GENES=='all')
# if(x=='OBJ_i')
# set(tmp, NULL, 'central', tmp[, factor(central, levels=c(1,0,-1), labels=c('increasing','stable','decreasing'))])
# if(x=='OBJ_iv')
# set(tmp, NULL, 'central', tmp[, factor(central, levels=c(1,0,-1), labels=c('>30%','15%-30%','<15%'))])
# ggplot(subset(tmp, TEAM!='True'), aes(y=DATA_T, x=central, xmin=lower95, xmax=upper95, colour=TEAM)) +
# geom_errorbarh(height=0.3) + geom_point(size=3, pch=13) +
# geom_point(data=subset(tmp, TEAM=='True'), size=3, colour='black', pch=18) +
# scale_colour_manual(values=TEAM_CL) +
# facet_grid(AC_L~INT_L, scales='free', space='free_y') +
# theme_bw() + theme(panel.margin.x= unit(2, "lines")) +
# labs(x= paste('\n',gsub('\n',' ',tmp[1, OBJ_L]),sep=''), y='')
# ggsave(file=paste(outdir,'/res_acrossTEAM_primaryregional_',x,'.pdf',sep=''), w=10, h=3)
# }))
# %INCIDENCE
tmp <- subset(df, OBJ%in%c('OBJ_ii') & !grepl('(',TEAM,fixed=1) & DATAT_L=='Village' & USED_GENES=='all')
set(tmp, NULL, 'OBJ_L2', tmp[, factor(as.character(OBJ_L2))])
tmp2<- as.data.table(expand.grid(central=c(0.01,0.03), AC_L=c('%Acute\nlow','%Acute\nhigh'), INT_L=c('ART scale up\nfast','ART scale up\nslow','ART scale up\nnone'), TEAM='dummy', DATA_T2='using\ntrue tree', OBJ_L2=c('%Incidence')))
set(tmp2, NULL, 'INT_L', tmp2[, factor(INT_L, levels=c('ART scale up\nfast','ART scale up\nslow','ART scale up\nnone'), labels=c('ART scale up\nfast','ART scale up\nslow','ART scale up\nnone'))])
setnames(tmp2, 'TEAM','team')
tmp3 <- subset(tmp, TEAM=='True')
setnames(tmp3, 'TEAM','team')
ggplot(subset(tmp, TEAM!='True'), aes(y=INT_L, x=central*100, colour=TEAM)) +
geom_point(data=tmp2, size=1, colour='transparent') +
geom_errorbarh(aes(xmin=lower95*100, xmax=upper95*100), height=0.3) + geom_point(size=3, pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(AC_L+DATA_T2~TEAM, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(0.5, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=2)) +
labs(x= '%', y='', title='Primary objectives: %Incidence\n(Village)\n')
ggsave(file=paste(outdir,'/res_acrossTEAM_primaryvillage_pcinc','.pdf',sep=''), w=13, h=8)
ggplot(subset(tmp, TEAM!='True'), aes(y=INT_L, x=central*100, colour=TEAM)) +
geom_point(data=tmp2, size=1, colour='transparent') +
geom_errorbarh(aes(xmin=lower95*100, xmax=upper95*100), height=0.3) + geom_point(size=3, pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(AC_L+DATA_T2~TEAM, scales='free_y', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(0.5, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=2)) +
labs(x= '%', y='', title='Primary objectives: %Incidence\n(Village)\n')
ggsave(file=paste(outdir,'/res_acrossTEAM_primaryvillage_pcinc_ssc','.pdf',sep=''), w=13, h=8)
# REDUCTION INCIDENCE
tmp <- subset(df, OBJ%in%c('OBJ_iii') & !grepl('(',TEAM,fixed=1) & DATAT_L=='Village' & USED_GENES=='all')
set(tmp, NULL, 'OBJ_L2', tmp[, factor(as.character(OBJ_L2))])
tmp2<- as.data.table(expand.grid(central=c(0.01,0.03), AC_L=c('%Acute\nlow','%Acute\nhigh'), INT_L=c('ART scale up\nfast','ART scale up\nslow','ART scale up\nnone'), TEAM='dummy', DATA_T2='using\ntrue tree', OBJ_L2=c('Incidence\nreduction')))
set(tmp2, NULL, 'INT_L', tmp2[, factor(INT_L, levels=c('ART scale up\nfast','ART scale up\nslow','ART scale up\nnone'), labels=c('ART scale up\nfast','ART scale up\nslow','ART scale up\nnone'))])
setnames(tmp2, 'TEAM','team')
tmp3 <- subset(tmp, TEAM=='True')
setnames(tmp3, 'TEAM','team')
ggplot(subset(tmp, TEAM!='True'), aes(y=INT_L, x=central, colour=TEAM)) +
geom_point(data=tmp2, size=1, colour='transparent') +
geom_vline(xintercept=1, colour='grey50', lwd=1) +
geom_errorbarh(aes(xmin=lower95, xmax=upper95), height=0.3) + geom_point(size=3, pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(AC_L+DATA_T2~TEAM, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(0.5, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=2)) +
labs(x= '%', y='', title='Primary objectives: Incidence Reduction\n(Village)\n')
ggsave(file=paste(outdir,'/res_acrossTEAM_primaryvillage_redinc','.pdf',sep=''), w=13, h=8)
ggplot(subset(tmp, TEAM!='True'), aes(y=INT_L, x=central, colour=TEAM)) +
geom_point(data=tmp2, size=1, colour='transparent') +
geom_vline(xintercept=1, colour='grey50', lwd=1) +
geom_errorbarh(aes(xmin=lower95, xmax=upper95), height=0.3) + geom_point(size=3, pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(AC_L+DATA_T2~TEAM, scales='free_y', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(0.5, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=2)) +
labs(x= '%', y='', title='Primary objectives: Incidence Reduction\n(Village)\n')
ggsave(file=paste(outdir,'/res_acrossTEAM_primaryvillage_redinc_ssc','.pdf',sep=''), w=13, h=8)
# %ACUTE BASELINE
tmp <- subset(df, OBJ%in%c('OBJ_v') & !grepl('(',TEAM,fixed=1) & DATAT_L=='Village' & USED_GENES=='all')
set(tmp, NULL, 'OBJ_L2', tmp[, factor(as.character(OBJ_L2))])
tmp2<- as.data.table(expand.grid(central=c(0.01,0.03), AC_L=c('%Acute\nlow','%Acute\nhigh'), INT_L=c('ART scale up\nfast','ART scale up\nslow','ART scale up\nnone'), TEAM='dummy', DATA_T2='using\ntrue tree', OBJ_L2=c('%Acute\n(baseline)')))
set(tmp2, NULL, 'INT_L', tmp2[, factor(INT_L, levels=c('ART scale up\nfast','ART scale up\nslow','ART scale up\nnone'), labels=c('ART scale up\nfast','ART scale up\nslow','ART scale up\nnone'))])
setnames(tmp2, 'TEAM','team')
tmp3 <- subset(tmp, TEAM=='True')
setnames(tmp3, 'TEAM','team')
ggplot(subset(tmp, TEAM!='True'), aes(y=INT_L, x=central*100, colour=TEAM)) +
geom_point(data=tmp2, size=1, colour='transparent') +
geom_errorbarh(aes(xmin=lower95*100, xmax=upper95*100), height=0.3) + geom_point(size=3, pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(AC_L+DATA_T2~TEAM, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(1, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=2)) +
labs(x= '%', y='', title='Primary objectives: %Acute at baseline\n(Village)\n')
ggsave(file=paste(outdir,'/res_acrossTEAM_primaryvillage_pcacutebaseline','.pdf',sep=''), w=8, h=7)
ggplot(subset(tmp, TEAM!='True'), aes(y=INT_L, x=central*100, colour=TEAM)) +
geom_point(data=tmp2, size=1, colour='transparent') +
geom_errorbarh(aes(xmin=lower95*100, xmax=upper95*100), height=0.3) + geom_point(size=3, pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
coord_cartesian(xlim=c(0,50)) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(AC_L+DATA_T2~TEAM, scales='free_y', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(1, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=2)) +
labs(x= '%', y='', title='Primary objectives: %Acute at baseline\n(Village)\n')
ggsave(file=paste(outdir,'/res_acrossTEAM_primaryvillage_pcacutebaseline_ssc','.pdf',sep=''), w=10, h=7)
# %ACUTE ENDPOINT
tmp <- subset(df, OBJ%in%c('OBJ_vi') & !grepl('(',TEAM,fixed=1) & DATAT_L=='Village' & USED_GENES=='all')
set(tmp, NULL, 'OBJ_L2', tmp[, factor(as.character(OBJ_L2))])
tmp2<- as.data.table(expand.grid(central=c(0.01,0.03), AC_L=c('%Acute\nlow','%Acute\nhigh'), INT_L=c('ART scale up\nfast','ART scale up\nslow','ART scale up\nnone'), TEAM='dummy', DATA_T2='using\ntrue tree', OBJ_L2=c('%Acute\n(endpoint)')))
set(tmp2, NULL, 'INT_L', tmp2[, factor(INT_L, levels=c('ART scale up\nfast','ART scale up\nslow','ART scale up\nnone'), labels=c('ART scale up\nfast','ART scale up\nslow','ART scale up\nnone'))])
setnames(tmp2, 'TEAM','team')
tmp3 <- subset(tmp, TEAM=='True')
setnames(tmp3, 'TEAM','team')
ggplot(subset(tmp, TEAM!='True'), aes(y=INT_L, x=central, colour=TEAM)) +
geom_point(data=tmp2, size=1, colour='transparent') +
geom_errorbarh(aes(xmin=lower95, xmax=upper95), height=0.3) + geom_point(size=3, pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(AC_L+DATA_T2~TEAM, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(1, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=2)) +
labs(x= '%', y='', title='Primary objectives: %Acute at endpoint\n(Village)\n')
ggsave(file=paste(outdir,'/res_acrossTEAM_primaryvillage_pcacuteend','.pdf',sep=''), w=8, h=7)
ggplot(subset(tmp, TEAM!='True'), aes(y=INT_L, x=central*100, colour=TEAM)) +
geom_point(data=tmp2, size=1, colour='transparent') +
geom_errorbarh(aes(xmin=lower95*100, xmax=upper95*100), height=0.3) + geom_point(size=3, pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(AC_L+DATA_T2~TEAM, scales='free_y', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(1, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=2)) +
labs(x= '%', y='', title='Primary objectives: %Acute at endpoint\n(Village)\n')
ggsave(file=paste(outdir,'/res_acrossTEAM_primaryvillage_pcacuteend_ssc','.pdf',sep=''), w=10, h=7)
# OVERALL
tmp <- subset(df, OBJ%in%c('OBJ_ii','OBJ_iii','OBJ_v','OBJ_vi') & !grepl('(',TEAM,fixed=1) & DATAT_L=='Village' & USED_GENES=='all')
tmp2<- as.data.table(expand.grid(central=c(0.01,0.55), AC_L=c('%Acute\nlow','%Acute\nhigh'), INT_L=c('ART scale up\nfast','ART scale up\nslow','ART scale up\nnone'), TEAM='dummy', DATA_T2='using\ntrue tree', OBJ_L2=c('%Acute\n(baseline)','%Acute\n(endpoint)')))
set(tmp2, NULL, 'INT_L', tmp2[, factor(INT_L, levels=c('ART scale up\nfast','ART scale up\nslow','ART scale up\nnone'), labels=c('ART scale up\nfast','ART scale up\nslow','ART scale up\nnone'))])
setnames(tmp2, 'TEAM','team')
tmp3 <- subset(tmp, TEAM=='True')
setnames(tmp3, 'TEAM','team')
ggplot(subset(tmp, TEAM!='True'), aes(y=INT_L, x=central, colour=TEAM)) +
geom_point(data=tmp2, size=1, colour='transparent') +
geom_errorbarh(aes(xmin=lower95, xmax=upper95), height=0.3) + geom_point(size=3, pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(TEAM+DATA_T2~OBJ_L2+AC_L, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(0.5, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=2)) +
labs(x= 'Estimates', y='', title='Primary objectives: quantitative\n(Village)\n')
ggsave(file=paste(outdir,'/res_acrossTEAM_primaryvillage_nmbrs','.pdf',sep=''), w=13, h=13)
# qualitative
tmp <- subset(df, OBJ%in%c('OBJ_i','OBJ_iv') & !grepl('(',TEAM,fixed=1) & DATAT_L=='Village' & USED_GENES=='all')
set(tmp, NULL, 'central', tmp[, as.character(central)])
set(tmp, tmp[, which(OBJ=='OBJ_i' & central=='-1')], 'central', 'declining')
set(tmp, tmp[, which(OBJ=='OBJ_i' & central=='0')], 'central', 'stable')
set(tmp, tmp[, which(OBJ=='OBJ_i' & central=='1')], 'central', 'increasing')
set(tmp, tmp[, which(OBJ=='OBJ_iv' & central=='-1')], 'central', '<15%')
set(tmp, tmp[, which(OBJ=='OBJ_iv' & central=='0')], 'central', '15%-30%')
set(tmp, tmp[, which(OBJ=='OBJ_iv' & central=='1')], 'central', '>30%')
tmp2 <- c('declining','stable','increasing','<15%','15%-30%','>30%')
set(tmp, NULL, 'central', tmp[, factor(central, levels=tmp2, labels=tmp2)])
tmp2 <- as.data.table(expand.grid(central=tmp2, AC_L=c('%Acute\nlow','%Acute\nhigh'), INT_L=c('ART scale up\nfast','ART scale up\nslow','ART scale up\nnone'), TEAM='dummy', DATA_T2='using\ntrue tree', OBJ_L2='Incidence\nTrend'))
set(tmp2, tmp2[, which(grepl('%', central))], 'OBJ_L2', '%Acute Ctgr\n(baseline)')
set(tmp2, NULL, 'INT_L', tmp2[, factor(INT_L, levels=c('ART scale up\nfast','ART scale up\nslow','ART scale up\nnone'), labels=c('ART scale up\nfast','ART scale up\nslow','ART scale up\nnone'))])
setkey(tmp2, central)
tmp3 <- subset(tmp, TEAM=='True')
setnames(tmp2, 'TEAM','team')
setnames(tmp3, 'TEAM','team')
ggplot(tmp2, aes(y=INT_L, x=central)) +
#geom_errorbarh(height=0.3) +
geom_point(data=tmp2, size=1, colour='transparent') +
geom_jitter(data=subset(tmp, TEAM!='True'), aes(colour=TEAM), size=3, pch=13, position = position_jitter(width=0, height=.15)) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(TEAM+DATA_T2~OBJ_L2+AC_L, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(0.5, "lines"), legend.position='bottom', panel.grid.minor= element_blank(), panel.grid.major= element_blank()) +
guides(colour=guide_legend(ncol=2)) +
labs(x= '\nEstimates', y='', title='Primary objectives: qualitative\n(Village)\n')
ggsave(file=paste(outdir,'/res_acrossTEAM_primaryvillage_qultv','.pdf',sep=''), w=13, h=13)
#invisible(lapply(c('OBJ_ii','OBJ_iii','OBJ_v','OBJ_vi'), function(x)
# {
# tmp <- subset(df, OBJ==x & !grepl('(',TEAM,fixed=1) & DATAT_L=='Village' & USED_GENES=='all')
# if(x=='OBJ_i')
# set(tmp, NULL, 'central', tmp[, factor(central, levels=c(1,0,-1), labels=c('increasing','stable','decreasing'))])
# if(x=='OBJ_iv')
# set(tmp, NULL, 'central', tmp[, factor(central, levels=c(1,0,-1), labels=c('>30%','15%-30%','<15%'))])
# ggplot(subset(tmp, TEAM!='True'), aes(y=DATA_T, x=central, xmin=lower95, xmax=upper95, colour=TEAM)) +
# geom_errorbarh(height=0.3) + geom_point(size=3, pch=13) +
# geom_point(data=subset(tmp, TEAM=='True'), size=3, colour='black', pch=18) +
# scale_colour_manual(values=TEAM_CL) +
# facet_grid(AC_L~INT_L, scales='free', space='free_y') +
# theme_bw() + theme(panel.margin.x= unit(2, "lines")) +
# labs(x= paste('\n',gsub('\n',' ',tmp[1, OBJ_L]),sep=''), y='')
# ggsave(file=paste(outdir,'/res_acrossTEAM_primaryvillage_',x,'.pdf',sep=''), w=10, h=3)
# }))
# SECONDARY: compare oversampling during intervention on regional
df <- subset(dfa, DATA_T=='phy' & SMPL_M=='extreme' & DATAT_L=='Regional')
tmp <- subset(dfa, DATA_T=='phy' & SMPL_M=='much' & SMPL_C=='8%' & SMPL_N==1600 & IMPRT=='5%' & AC_T=='low' & INT_T!='none')
df <- rbind(tmp, df)
tmp <- subset(df, OBJ%in%c('OBJ_ii','OBJ_iii','OBJ_v','OBJ_vi') & !grepl('(',TEAM,fixed=1) & DATAT_L=='Regional' & USED_GENES=='all')
tmp[, SMPL_L:= NA_character_]
set(tmp, tmp[, which(SMPL_M=='much')], 'SMPL_L', 'sampling\n50% during intervention')
set(tmp, tmp[, which(SMPL_M=='extreme')], 'SMPL_L', 'sampling\n85% during intervention')
set(tmp, NULL, 'SMPL_L', tmp[, factor(SMPL_L, levels=c('sampling\n85% during intervention','sampling\n50% during intervention'), labels=c('sampling\n85% during intervention','sampling\n50% during intervention'))])
ggplot(subset(tmp, TEAM!='True'), aes(y=SMPL_L, x=central, xmin=lower95, xmax=upper95, colour=TEAM)) +
geom_errorbarh(height=0.3) + geom_point(size=3, pch=13) +
geom_point(data=subset(tmp, TEAM=='True'), size=3, colour='black', pch=18) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(INT_L~OBJ_L2+AC_L, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(0.5, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=2)) +
labs(x= 'Estimates', y='', title='Secondary objective: oversampling during intervention\n(Regional, using true tree)\n')
ggsave(file=paste(outdir,'/res_acrossTEAM_secondary_oversmplintrvntnregional','.pdf',sep=''), w=13, h=5.5)
# SECONDARY: compare imports high / low
df <- subset(dfa, SC_RND%in%c('P','E','L','H','12','10'))
tmp <- subset(df, OBJ%in%c('OBJ_ii','OBJ_iii','OBJ_v','OBJ_vi') & !grepl('(',TEAM,fixed=1) & USED_GENES=='all')
tmp[, IMPRT_L:= NA_character_]
set(tmp, tmp[, which(IMPRT=='0%')], 'IMPRT_L', '0% trns/year from outside\n(Village)')
set(tmp, tmp[, which(IMPRT=='5%')], 'IMPRT_L', '5% trns/year from outside\n(Regional)')
set(tmp, tmp[, which(IMPRT=='20%')], 'IMPRT_L', '20% trns/year from outside\n(Regional)')
set(tmp, NULL, 'IMPRT_L', tmp[, factor(IMPRT_L, levels=rev(c('0% trns/year from outside\n(Village)','5% trns/year from outside\n(Regional)','20% trns/year from outside\n(Regional)')), labels=rev(c('0% trns/year from outside\n(Village)','5% trns/year from outside\n(Regional)','20% trns/year from outside\n(Regional)')))])
set(tmp, NULL, 'INT_L', tmp[, factor(as.character(INT_L), levels=c('ART scale up\nfast','ART scale up\nslow'), labels=c('ART scale up\nfast','ART scale up\nslow'))])
tmp3 <- subset(tmp, TEAM=='True')
setnames(tmp3, 'TEAM','team')
tmp2 <- as.data.table(expand.grid( central=0.1, IMPRT_L=rev(c('0% trns/year from outside\n(Village)','5% trns/year from outside\n(Regional)','20% trns/year from outside\n(Regional)')),
AC_L='%Acute\nhigh', INT_L=c('ART scale up\nfast','ART scale up\nslow'), TEAM=c('Imperial', 'Vancouver', 'Cambridge/Imperial', 'ETH Zurich'), OBJ_L2='%Incidence'))
ggplot(tmp2, aes(y=IMPRT_L, x=central, colour=TEAM)) +
geom_point(size=1, colour='transparent') +
geom_errorbarh(data=subset(tmp, TEAM!='True'), aes(xmin=lower95, xmax=upper95), height=0.3) +
geom_point(data=subset(tmp, TEAM!='True'), size=3, pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(TEAM+INT_L~OBJ_L2+AC_L, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(0.5, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=2)) +
labs(x= '\nEstimates', y='', title='Secondary objective: transmissions from outside\n(using true tree)\n')
ggsave(file=paste(outdir,'/res_acrossTEAM_secondary_cntm','.pdf',sep=''), w=13, h=12)
# SECONDARY: compare duration sampling
df <- subset(dfa, SC_RND%in%c('O','F','T','L'))
tmp <- subset(df, OBJ%in%c('OBJ_ii','OBJ_iii','OBJ_v','OBJ_vi') & !grepl('(',TEAM,fixed=1) & USED_GENES=='all')
tmp[, SMPL_L:= NA_character_]
set(tmp, tmp[, which(SMPL_D=='3')], 'SMPL_L', '3 yr sampling duration\nafter intervention start')
set(tmp, tmp[, which(SMPL_D=='5')], 'SMPL_L', '5 yr sampling duration\nafter intervention start')
set(tmp, NULL, 'SMPL_L', tmp[, factor(SMPL_L, levels=rev(c('3 yr sampling duration\nafter intervention start','5 yr sampling duration\nafter intervention start')), labels=rev(c('3 yr sampling duration\nafter intervention start','5 yr sampling duration\nafter intervention start')))])
ggplot(subset(tmp, TEAM!='True'), aes(y=SMPL_L, x=central, xmin=lower95, xmax=upper95, colour=TEAM)) +
geom_errorbarh(height=0.3) + geom_point(size=3, pch=13) +
geom_point(data=subset(tmp, TEAM=='True'), size=3, colour='black', pch=18) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(INT_L~OBJ_L2+AC_L, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(0.5, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=2)) +
labs(x= 'Estimates', y='', title='Secondary objective: sampling duration after intervention start\n(Regional, using true tree)\n')
ggsave(file=paste(outdir,'/res_acrossTEAM_secondary_sdurregional','.pdf',sep=''), w=13, h=4)
# SECONDARY: compare seq coverage
df <- subset(dfa, SC_RND%in%c('I','J','G','K','T','R','L','H','05','08','06','07','11','09','12','10'))
tmp <- subset(df, OBJ%in%c('OBJ_ii','OBJ_iii','OBJ_v','OBJ_vi') & !grepl('(',TEAM,fixed=1) & USED_GENES=='all')
tmp[, SMPL_L:= NA_character_]
set(tmp, tmp[, which(SMPL_C=='8%')], 'SMPL_L', '8% coverage (Regional)')
set(tmp, tmp[, which(SMPL_C=='16%')], 'SMPL_L', '16% coverage (Regional)')
set(tmp, tmp[, which(SMPL_C=='30%')], 'SMPL_L', '30% coverage (Village)')
set(tmp, tmp[, which(SMPL_C=='60%')], 'SMPL_L', '60% coverage (Village)')
tmp2 <- c('8% coverage (Regional)','16% coverage (Regional)', '30% coverage (Village)', '60% coverage (Village)')
set(tmp, NULL, 'SMPL_L', tmp[, factor(SMPL_L, levels=rev(tmp2), labels=rev(tmp2))])
set(tmp, NULL, 'INT_L', tmp[, factor(as.character(INT_L), levels=c('ART scale up\nfast','ART scale up\nslow'), labels=c('ART scale up\nfast','ART scale up\nslow'))])
tmp3 <- subset(tmp, TEAM=='True')
setnames(tmp3, 'TEAM','team')
ggplot(subset(tmp, TEAM!='True'), aes(y=SMPL_L, x=central, xmin=lower95, xmax=upper95, colour=TEAM)) +
geom_errorbarh(height=0.3) + geom_point(size=3, pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(TEAM+INT_L~OBJ_L2+AC_L, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(0.5, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=2)) +
labs(x= 'Estimates', y='', title='Secondary objective: sampling coverage\n(using true tree)\n')
ggsave(file=paste(outdir,'/res_acrossTEAM_secondary_scvrg','.pdf',sep=''), w=13, h=10)
tmp <- subset(tmp, !(OBJ=='OBJ_ii' & central>0.1) & !(OBJ=='OBJ_v' & central>0.3 & AC_T=='low') & !(OBJ=='OBJ_vi' & central>0.3 & AC_T=='low'))
tmp2 <- as.data.table(expand.grid( central=0.1, SMPL_L=rev(c('8% coverage (Regional)','16% coverage (Regional)', '30% coverage (Village)', '60% coverage (Village)')),
AC_L='%Acute\nlow', INT_L=c('ART scale up\nfast','ART scale up\nslow'), TEAM=c('Imperial', 'Vancouver', 'Cambridge/Imperial', 'ETH Zurich'), OBJ_L2='%Incidence'))
ggplot(tmp2, aes(y=SMPL_L, x=central, colour=TEAM)) +
geom_point(size=1, colour='transparent') +
geom_point(data=subset(tmp, TEAM!='True'), size=3, pch=13) + geom_errorbarh(data=subset(tmp, TEAM!='True'), aes(xmin=lower95, xmax=upper95), height=0.3) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(TEAM+INT_L~OBJ_L2+AC_L, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(0.5, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=2)) +
labs(x= '\nEstimates', y='', title='Secondary objective: sampling coverage\n(using true tree, range cut)\n')
ggsave(file=paste(outdir,'/res_acrossTEAM_secondary_scvrgcut','.pdf',sep=''), w=13, h=11)
dfi<- subset( dfa, select=c(SC_RND, DATAT_L, DATA_T, AC_T, INT_T, IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D) )
setkey(dfi, DATAT_L, DATA_T, AC_T, INT_T, IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D)
dfi <- unique(dfi)
file<- paste(outdir,'/../docs/SC_RND_info','.csv',sep='')
write.csv(dfi, file=file, row.names=FALSE)
tmp <- subset(dfa, select=c(TEAM, DATA_T, DATAT_L, SIM_SCENARIO))
setkey(tmp, TEAM, DATA_T, DATAT_L, SIM_SCENARIO)
tmp <- unique(tmp)
tmp[, table(TEAM, DATA_T, DATAT_L )]
}
##--------------------------------------------------------------------------------------------------------
## evaluate results
## olli 08.05.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Feb2015.evaluate.read<- function()
{
# read truth for regional simus
indir <- '/Users/Oliver/Dropbox\ (Infectious Disease)/PANGEAHIVsim_internal/documents/external/2015_05_results'
file <- paste(indir, '/answers_Regional_Feb2015_rFormat.csv', sep='')
df <- read.submission(file, verbose=0, reset.OBJiv.conservative=1)
# read truth for village simus
file <- paste(indir, '/answers_Village_Feb2015-yr43_rFormat.csv', sep='')
tmp <- read.submission(file, verbose=0, reset.OBJiv.conservative=1)
set(tmp, NULL, 'TEAM', 'True')
df <- rbind(df, tmp)
# read submissions
tmp <- list.files(indir, pattern='csv$')
tmp <- tmp[!grepl('answers',tmp)]
# read Eriks multiple submissions
tmp2 <- data.table(FILE=tmp[grepl('cambImp',tmp)])
tmp2[, RUN:= tmp2[, sapply( strsplit(FILE,'_'), function(x) rev(x)[1] )]]
set(tmp2, NULL, 'RUN', tmp2[, substr(RUN, 1, nchar(RUN)-4)])
set(tmp2, NULL, 'RUN', tmp2[, gsub('results0','',RUN)])
dfs <- do.call('rbind',lapply(seq_len(nrow(tmp2)), function(i)
{
z <- read.submission( paste(indir, '/', tmp2[i, FILE], sep=''), verbose=0, reset.OBJiv.conservative=1 )
set(z, NULL, 'TEAM', z[, paste(TEAM, ' (', tmp2[i, RUN], ')', sep='')])
z
}))
tmp <- tmp[!grepl('cambImp',tmp)]
tmp <- do.call('rbind',lapply(tmp, function(x) read.submission(paste(indir,'/',x,sep=''), verbose=0, reset.OBJiv.conservative=1)))
dfs <- rbind(dfs, tmp)
# change team name
set(dfs, dfs[, which(TEAM=='Colijn')],'TEAM','Imperial')
# construct Erik's gold submission
# for regional tree, use mergedTab
tmp <- subset(dfs, grepl('merged', TEAM) & grepl('Regional',SIM_SCENARIO))
tmp[, TEAM:='Cambridge/Imperial']
#tmp <- subset(dfs, grepl('mh30', TEAM) & grepl('Regional',SIM_SCENARIO))
#tmp[, TEAM:='Cambridge/Imperial']
#tmp2 <- subset(dfs, grepl('mh15', TEAM) & grepl('Regional',SIM_SCENARIO))
#tmp2[, TEAM:='Cambridge/Imperial']
#tmp <- merge(tmp, tmp2, by=c('TEAM','SUBMISSION_DATE','SIM_SCENARIO','USED_GENES','OBJ'), all=1)
#tmp2 <- tmp[, which(is.na(central.x))]
#set(tmp, tmp2, 'central.x', tmp[tmp2, central.y])
#set(tmp, tmp2, 'lower95.x', tmp[tmp2, lower95.y])
#set(tmp, tmp2, 'upper95.x', tmp[tmp2, upper95.y])
#setnames(tmp, c('central.x', 'lower95.x', 'upper95.x'), c('central', 'lower95', 'upper95'))
#set(tmp, NULL, c('central.y', 'lower95.y', 'upper95.y'), NULL)
dfs <- rbind(dfs, tmp)
# for village tree, use mh30 where available and mh15 where mh30 not available
tmp <- subset(dfs, grepl('mh30', TEAM) & grepl('Vill',SIM_SCENARIO))
tmp[, TEAM:='Cambridge/Imperial']
tmp2 <- subset(dfs, grepl('mh15', TEAM) & grepl('Vill',SIM_SCENARIO))
tmp2[, TEAM:='Cambridge/Imperial']
tmp <- merge(tmp, tmp2, by=c('TEAM','SUBMISSION_DATE','SIM_SCENARIO','USED_GENES','OBJ'), all=1)
tmp2 <- tmp[, which(is.na(central.x))]
set(tmp, tmp2, 'central.x', tmp[tmp2, central.y])
set(tmp, tmp2, 'lower95.x', tmp[tmp2, lower95.y])
set(tmp, tmp2, 'upper95.x', tmp[tmp2, upper95.y])
setnames(tmp, c('central.x', 'lower95.x', 'upper95.x'), c('central', 'lower95', 'upper95'))
set(tmp, NULL, c('central.y', 'lower95.y', 'upper95.y'), NULL)
dfs <- rbind(dfs, tmp)
# for village seq, use LSD
tmp <- subset(dfs, grepl('lsd', TEAM) & grepl('Vill',SIM_SCENARIO))
tmp[, TEAM:='Cambridge/Imperial']
dfs <- rbind(dfs, tmp)
# define data types (seq or phylo)
dfa <- rbind(dfs, df)
dfa[, DATA_T:=NA_character_]
set(dfa, dfa[, which(grepl('Vill_0[1-4]', SIM_SCENARIO))], 'DATA_T', 'seq')
set(dfa, dfa[, which(!grepl('Vill_0[1-4]', SIM_SCENARIO))], 'DATA_T', 'phy')
set(dfa, dfa[, which(grepl('FirstObj', SIM_SCENARIO))], 'DATA_T', 'seq')
set(dfa, dfa[, which(grepl('SecondObj', SIM_SCENARIO))], 'DATA_T', 'phy')
stopifnot(!any(is.na(dfa[, DATA_T])))
# define randomized scenario IDs
dfa[, SC_RND:=NA_character_]
tmp <- dfa[, which(grepl('Regional',SIM_SCENARIO))]
set(dfa, tmp, 'SC_RND', dfa[tmp, substring(regmatches(SIM_SCENARIO,regexpr('sc[A-Z]',SIM_SCENARIO)),3)])
tmp <- dfa[, which(grepl('Vill',SIM_SCENARIO))]
set(dfa, tmp, 'SC_RND', dfa[tmp, substring(regmatches(SIM_SCENARIO,regexpr('Vill_[0-9]+',SIM_SCENARIO)),6)])
stopifnot(!any(is.na(dfa[, SC_RND])))
# describe regional simulations in terms of fast/low intervention high/low acute
set.seed(42)
dfi <- data.table(FILE=list.files('/Users/Oliver/duke/2014_Gates/methods_comparison_pipeline/FINAL', '.*zip$', full.names=FALSE))
dfi[, SC:= sapply(strsplit(FILE, '_'),'[[',3)]
dfi[, CONFIG:= sapply(strsplit(SC, '-'),'[[',2)]
set(dfi, NULL, 'SC', dfi[, sapply(strsplit(SC, '-'),'[[',1)])
dfi[, DATAT:= sapply(strsplit(FILE, '_'),'[[',5)]
set(dfi, NULL, 'DATAT', dfi[, gsub('.zip','',DATAT,fixed=T)])
set(dfi, NULL, 'OBJECTIVE', 'SecondObj')
set(dfi, dfi[,which(CONFIG=='sq')],'OBJECTIVE', 'FirstObj')
dfi <- merge(dfi,dfi[, list(FILE=FILE, DUMMY=sample(length(FILE),length(FILE))), by='OBJECTIVE'],by=c('OBJECTIVE','FILE'))
tmp <- dfi[, which(OBJECTIVE=='SecondObj')]
set(dfi, tmp, 'DUMMY', dfi[tmp, DUMMY] + dfi[OBJECTIVE=='FirstObj', max(DUMMY)])
setkey(dfi, DUMMY)
dfi[, SC_RND:= toupper(letters[seq_len(nrow(dfi))])]
dfi <- subset(dfi, select=c(SC, SC_RND, CONFIG))
set(dfi, NULL, 'SC', dfi[, substring(SC, 3)])
dfi <- merge( dfi, data.table(SC= c('A','B','C','D','E','F'), AC_T=c('low','high','low','high','low','high'), INT_T=c('fast','fast','slow','slow','none','none')), by='SC' )
tmp <- data.table( CONFIG= c('sq','s2x','y3','mFP85','ph','tr20'),
IMPRT= c(.05, .05, .05, .05, .05, .2),
SMPL_N= c(1600, 3200, 1280, 1600, 1600, 1600),
SMPL_C= c(0.08, 0.16, 0.08, 0.08, 0.08, 0.08),
SMPL_M= c('overs', 'overs', 'overs', 'extrs', 'overs', 'overs'),
SMPL_D= c(5, 5, 3, 5, 5, 5))
dfi <- merge( dfi, tmp, by='CONFIG')
set(dfi, NULL, c('CONFIG'), NULL)
# add info for village
tmp <- data.table( SC_RND= c('03','02','01','04','05','08','06','07','11','09','12','10','00'),
AC_T= c('low','low','high','high','low','low','high','high','low','low','high','high','low'),
INT_T= c('fast','slow','fast','slow','fast','slow','fast','slow','fast','slow','fast','slow','none'),
#SMPL_C= c(0.25, 0.25, 0.25, 0.25, 0.5, 0.5, 0.5, 0.5, 0.25, 0.25, 0.25, 0.25, 0.25),
SMPL_C= c(0.3, 0.3, 0.3, 0.3, 0.6, 0.6, 0.6, 0.6, 0.3, 0.3, 0.3, 0.3, 0.3),
SMPL_D= 5,
SMPL_N= c(777, 857, 957, 1040, 1469, 1630, 1831, 1996, 638, 686, 956, 1012, 872),
IMPRT= c(0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0, 0, 0, 0, 0.02))
dfi <- rbind(dfi, tmp, fill=TRUE,use.names=TRUE)
# merge to dfa
cat(paste('\nnumber of rows before merge with dfi, n=', nrow(dfa)))
dfa <- merge(dfa, dfi, by='SC_RND')
cat(paste('\nnumber of rows before merge with dfi, n=', nrow(dfa)))
dfa
}
##--------------------------------------------------------------------------------------------------------
## check simulated sequences: create ExaML tree and estimate R2
## olli 09.02.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Feb2015.randomize<- function()
{
set.seed(42)
indir <- '/Users/Oliver/duke/2014_Gates/methods_comparison_pipeline/FINAL'
outdir <- '/Users/Oliver/duke/2014_Gates/methods_comparison_pipeline/FINAL_RND'
dfi <- data.table(FILE=list.files(indir, '.*zip$', full.names=FALSE))
dfi[, SC:= sapply(strsplit(FILE, '_'),'[[',3)]
dfi[, CONFIG:= sapply(strsplit(SC, '-'),'[[',2)]
set(dfi, NULL, 'SC', dfi[, sapply(strsplit(SC, '-'),'[[',1)])
dfi[, DATAT:= sapply(strsplit(FILE, '_'),'[[',5)]
set(dfi, NULL, 'DATAT', dfi[, gsub('.zip','',DATAT,fixed=T)])
set(dfi, NULL, 'OBJECTIVE', 'SecondObj')
set(dfi, dfi[,which(CONFIG=='sq')],'OBJECTIVE', 'FirstObj')
dfi <- merge(dfi,dfi[, list(FILE=FILE, DUMMY=sample(length(FILE),length(FILE))), by='OBJECTIVE'],by=c('OBJECTIVE','FILE'))
tmp <- dfi[, which(OBJECTIVE=='SecondObj')]
set(dfi, tmp, 'DUMMY', dfi[tmp, DUMMY] + dfi[OBJECTIVE=='FirstObj', max(DUMMY)])
setkey(dfi, DUMMY)
dfi[, SC_RND:= toupper(letters[seq_len(nrow(dfi))])]
dfi[, GSUB_FROM:= sapply(strsplit(FILE, '_'),'[[',3)]
dfi[, GSUB_TO:= paste(OBJECTIVE,'_sc',SC_RND,sep='')]
tmp <- dfi[, {
cat(paste('process file',FILE))
tmpdir <- paste(indir,'/tmp',SC_RND,sep='')
unzip( paste(indir,'/',FILE,sep=''), exdir=tmpdir)
files.name.o <- list.files( paste(indir,'/tmp',SC_RND,sep='') )
files.name.n <- gsub(GSUB_FROM,GSUB_TO,files.name.o)
files.name.n <- gsub('HPTN071','PANGEAsim_Regional',files.name.n)
file.rename( paste(tmpdir, files.name.o, sep='/'), paste(tmpdir, files.name.n, sep='/'))
newzip <- gsub(GSUB_FROM,GSUB_TO,FILE)
newzip <- gsub('HPTN071','PANGEAsim_Regional',newzip)
zip( paste(outdir,'/',newzip,sep=''), paste(tmpdir, files.name.n, sep='/'), flags = "-Fr9XTj")
file.remove( paste(tmpdir, files.name.n, sep='/'), showWarnings=FALSE)
file.remove( tmpdir, showWarnings=FALSE )
}, by='DUMMY']
}
##--------------------------------------------------------------------------------------------------------
## check simulated sequences: create ExaML tree and estimate R2
## olli 23.10.14
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.November2014<- function()
{
# with CD4
if(0)
{
indir <- system.file(package="PANGEA.HIV.sim", "misc")
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.05,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.00447743)-0.3^2/2, wher.sigma=0.3, bwerm.mu=log(0.002239075)-0.13^2/2, bwerm.sigma=0.13, er.gamma=NA,
dbg.GTRparam=0, dbg.rER=0, index.starttime.mode='normal', startseq.mode='sample', seqtime.mode=NA)
# standard run, fixed GTR param + fixed relative rates for each transmission chain
# no WH Er to equal BH ER, and both being constant
pipeline.vary <- data.table( seqtime.mode=c('Gamma9','Gamma3','Unif12'),
label=c('-n0005g9','-n0005g3','-n0005u'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='seqtime.mode' ), 'v', as.character(seqtime.mode))
print(pipeline.args)
# scenario A
infile.ind <- '211014_RUN123_SCENARIO_0'
infile.trm <- '211014_RUN123_SCENARIO_0'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp141024'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max' ), 'v', '0.11')
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
}
# with CD4, lower WH rate
if(1)
{
indir <- system.file(package="PANGEA.HIV.sim", "misc")
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.20,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.003358613)-0.3^2/2, wher.sigma=0.3, bwerm.mu=log(0.002239075)-0.13^2/2, bwerm.sigma=0.13, er.gamma=NA,
dbg.GTRparam=0, dbg.rER=0, index.starttime.mode='fix', startseq.mode='sample', seqtime.mode=NA)
# standard run, fixed GTR param + fixed relative rates for each transmission chain
# no WH Er to equal BH ER, and both being constant
pipeline.vary <- data.table( seqtime.mode=c('Unif12'),
label=c('-n0005ul20'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='seqtime.mode' ), 'v', as.character(seqtime.mode))
print(pipeline.args)
if(1)
{
# scenario A
infile.ind <- '271014_HPTN071_scA_rep1'
infile.trm <- '271014_HPTN071_scA_rep1'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp141025a'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max' ), 'v', '0.11')
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}
}, by='label']
}
# with CD4, lower WH rate
if(1)
{
indir <- system.file(package="PANGEA.HIV.sim", "misc")
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.05,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.003358613)-0.3^2/2, wher.sigma=0.3, bwerm.mu=log(0.002239075)-0.13^2/2, bwerm.sigma=0.13, er.gamma=NA,
dbg.GTRparam=0, dbg.rER=0, index.starttime.mode='normal', startseq.mode='sample', seqtime.mode=NA)
# standard run, fixed GTR param + fixed relative rates for each transmission chain
# no WH Er to equal BH ER, and both being constant
pipeline.vary <- data.table( seqtime.mode=c('Gamma9','Gamma3','Unif12'),
label=c('-n0005g9l','-n0005g3l','-n0005ul'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='seqtime.mode' ), 'v', as.character(seqtime.mode))
print(pipeline.args)
if(1)
{
# scenario A
infile.ind <- '271014_HPTN071_scA_rep1'
infile.trm <- '271014_HPTN071_scA_rep1'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp141025a'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max' ), 'v', '0.11')
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}
if(0)
{
# scenario B
infile.ind <- '271014_HPTN071_scB_rep1'
infile.trm <- '271014_HPTN071_scB_rep1'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp141025b'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max' ), 'v', '0.15')
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}
if(0)
{
# scenario C
infile.ind <- '271014_HPTN071_scC_rep1'
infile.trm <- '271014_HPTN071_scC_rep1'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp141025c'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max' ), 'v', '0.185')
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}
}, by='label']
}
# with CD4, no introductions after baseline for Leventhal et al
if(1)
{
indir <- system.file(package="PANGEA.HIV.sim", "misc")
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0,
epi.model='HPTN071', epi.dt=1/48, epi.import=0,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.003358613)-0.3^2/2, wher.sigma=0.3, bwerm.mu=log(0.002239075)-0.13^2/2, bwerm.sigma=0.13, er.gamma=NA,
dbg.GTRparam=0, dbg.rER=0, index.starttime.mode='normal', startseq.mode='sample', seqtime.mode=NA)
# standard run, fixed GTR param + fixed relative rates for each transmission chain
# no WH Er to equal BH ER, and both being constant
pipeline.vary <- data.table( seqtime.mode=c('Unif12'),
label=c('-Leventhal00ul'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='seqtime.mode' ), 'v', as.character(seqtime.mode))
print(pipeline.args)
if(1)
{
# scenario A
infile.ind <- '271014_HPTN071_scA_rep1'
infile.trm <- '271014_HPTN071_scA_rep1'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp141114a'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max' ), 'v', '0.11')
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
if(1)
{
# scenario B
infile.ind <- '271014_HPTN071_scB_rep1'
infile.trm <- '271014_HPTN071_scB_rep1'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp141114b'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max' ), 'v', '0.15')
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
if(1)
{
# scenario C
infile.ind <- '271014_HPTN071_scC_rep1'
infile.trm <- '271014_HPTN071_scC_rep1'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp141114c'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max' ), 'v', '0.185')
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
}, by='label']
}
# with CD4, same WH rate for Suchard et al
if(1)
{
indir <- system.file(package="PANGEA.HIV.sim", "misc")
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.05,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.002239075)-0.13^2/2, wher.sigma=0.13, bwerm.mu=log(0.002239075)-0.13^2/2, bwerm.sigma=0.13, er.gamma=NA,
dbg.GTRparam=1, dbg.rER=1, index.starttime.mode='normal', startseq.mode='sample', seqtime.mode=NA)
# standard run, fixed GTR param + fixed relative rates for each transmission chain
# no WH Er to equal BH ER, and both being constant
pipeline.vary <- data.table( seqtime.mode=c('Unif12'),
label=c('-Suchard05ul'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='seqtime.mode' ), 'v', as.character(seqtime.mode))
print(pipeline.args)
if(1)
{
# scenario A
infile.ind <- '271014_HPTN071_scA_rep1'
infile.trm <- '271014_HPTN071_scA_rep1'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp141114a'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max' ), 'v', '0.11')
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
if(1)
{
# scenario B
infile.ind <- '271014_HPTN071_scB_rep1'
infile.trm <- '271014_HPTN071_scB_rep1'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp141114b'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max' ), 'v', '0.15')
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
if(1)
{
# scenario C
infile.ind <- '271014_HPTN071_scC_rep1'
infile.trm <- '271014_HPTN071_scC_rep1'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp141114c'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max' ), 'v', '0.185')
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
#system(file)
}
}, by='label']
}
# all replicates
if(1)
{
indir <- '/Users/Oliver/duke/2014_Gates/methods_comparison_pipeline/Pangea_Oct28_2014_sim'
#indir <- '/Users/Oliver/git/Pangea__Oct27_2014_code'
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.05,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.003358613)-0.3^2/2, wher.sigma=0.3, bwerm.mu=log(0.002239075)-0.13^2/2, bwerm.sigma=0.13, er.gamma=0,
dbg.GTRparam=0, dbg.rER=0, index.starttime.mode='normal', startseq.mode='sample', seqtime.mode='Unif12')
pipeline.vary <- data.table( REP=1:100, label=paste('-r',1:100,sep='') )
dummy <- pipeline.vary[, {
if(1)
{
# scenario A
infile.ind <- paste('281014_HPTN071_scA_rep',REP,sep='')
infile.trm <- paste('281014_HPTN071_scA_rep',REP,sep='')
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp141026a'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max' ), 'v', '0.11')
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,'_IND.csv',sep=''), paste(infile.trm,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}
if(1)
{
# scenario B
infile.ind <- paste('281014_HPTN071_scB_rep',REP,sep='')
infile.trm <- paste('281014_HPTN071_scB_rep',REP,sep='')
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp141026b'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max' ), 'v', '0.15')
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,'_IND.csv',sep=''), paste(infile.trm,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}
if(1)
{
# scenario C
infile.ind <- paste('281014_HPTN071_scC_rep',REP,sep='')
infile.trm <- paste('281014_HPTN071_scC_rep',REP,sep='')
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp141026c'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max' ), 'v', '0.185')
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,'_IND.csv',sep=''), paste(infile.trm,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}
}, by='label']
}
}
##--------------------------------------------------------------------------------------------------------
## check simulated sequences: create ExaML tree and estimate R2
## olli 14.09.14
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.October2014<- function()
{
if(0)
{
indir <- system.file(package="PANGEA.HIV.sim", "misc")
infile.ind <- '140716_RUN001'
infile.trm <- '140716_RUN001'
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.2,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.1,
v.N0tau=3.58e4, v.r=2, v.T50=-1,
wher.mu=NA, wher.sigma=NA,
bwerm.mu=1.5, bwerm.sigma=0.12 )
pipeline.vary <- data.table(wher.mu=c(0.005, 0.004, 0.003), wher.sigma=c(0.8, 0.7, 0.6), label=c('-5','-4','-3'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='wher.mu' ), 'v', as.character(wher.mu))
set(pipeline.args, which( pipeline.args$stat=='wher.sigma' ), 'v', as.character(wher.sigma))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140914'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
}
if(0)
{
indir <- system.file(package="PANGEA.HIV.sim", "misc")
infile.ind <- '140716_RUN001'
infile.trm <- '140716_RUN001'
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.2,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.1,
v.N0tau=3.58e4, v.r=2, v.T50=-1,
wher.mu=0.004, wher.sigma=0.7,
bwerm.mu=NA, bwerm.sigma=NA )
pipeline.vary <- data.table(bwerm.mu=c(1.5, 1.75, 2.0), bwerm.sigma=c(0.12, 0.103, 0.09), label=c('-b15','-b17','-b20'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='bwerm.mu' ), 'v', as.character(bwerm.mu))
set(pipeline.args, which( pipeline.args$stat=='bwerm.sigma' ), 'v', as.character(bwerm.sigma))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140915'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
}
if(0)
{
indir <- system.file(package="PANGEA.HIV.sim", "misc")
infile.ind <- '140716_RUN001'
infile.trm <- '140716_RUN001'
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.2,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.1,
v.N0tau=1, v.r=NA, v.T50=-2,
wher.mu=0.004, wher.sigma=0.7,
bwerm.mu=1.75, bwerm.sigma=0.103 )
pipeline.vary <- data.table(v.r=c(6.305779, 5.154461, 4.003191, 2.851904), label=c('-n5','-n4','-n3','-n2'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='v.r' ), 'v', as.character(v.r))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140915b'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
}
if(0)
{
indir <- system.file(package="PANGEA.HIV.sim", "misc")
infile.ind <- '140716_RUN001'
infile.trm <- '140716_RUN001'
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.2,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.1,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=NA, wher.sigma=NA,
bwerm.mu=4, bwerm.sigma=0.06 )
pipeline.vary <- data.table(wher.mu=exp(c(-5.477443+0.18, -5.071977+0.08, -4.784295+0.045)), wher.sigma=c(0.6, 0.4, 0.3), label=c('-2','-3','-4'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='wher.mu' ), 'v', as.character(wher.mu))
set(pipeline.args, which( pipeline.args$stat=='wher.sigma' ), 'v', as.character(wher.sigma))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140916'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
}
# changed: transmission edge model; fixed: transmission edges are within transmitter
if(0)
{
indir <- system.file(package="PANGEA.HIV.sim", "misc")
infile.ind <- '140716_RUN001'
infile.trm <- '140716_RUN001'
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.2,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.1,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=exp(-5.071977+0.08), wher.sigma=0.4,
bwerm.mu=NA,
bwerm.sigma=NA)
# bwerm.mu is now shift
pipeline.vary <- data.table(bwerm.mu=c(0, -2e-4, -4e-4, -6e-4, -8e-4, -1e-3), label=c('-s0','-s2','-s4','-s6','-s8','-s10'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='bwerm.mu' ), 'v', as.character(bwerm.mu))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140916b'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
# rerun: bug?? NOPE!
indir <- system.file(package="PANGEA.HIV.sim", "misc")
indir <- ifelse(indir=='','/Users/Oliver/git/HPTN071sim/raw_trchain',indir)
infile.ind <- '140716_RUN001'
infile.trm <- '140716_RUN001'
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.2,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.1,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=exp(-5.071977+0.08), wher.sigma=0.4,
bwerm.mu=NA,
bwerm.sigma=NA)
# bwerm.mu is now shift
pipeline.vary <- data.table(bwerm.mu=c(0, -5e-4, -1e-3), label=c('-s0','-s5','-s10'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='bwerm.mu' ), 'v', as.character(bwerm.mu))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140917'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
}
# changed: only edges leading to tip are under within host evol (dead ends); added sdlog to transmission ER model; changed params to log scale
if(0)
{
indir <- system.file(package="PANGEA.HIV.sim", "misc")
infile.ind <- '140716_RUN001'
infile.trm <- '140716_RUN001'
# scenario 3* faster WH up to 0.02
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.2,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.1,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.006716145)-0.37^2/2, wher.sigma=0.37,
bwerm.mu=NA,
bwerm.sigma=NA)
# bwerm.mu is now shift
pipeline.vary <- data.table(bwerm.mu=c(log(0.002239075)-0.05^2/2, log(0.002239075-0.0005)-0.065^2/2, log(0.002239075-0.001)-0.09^2/2), bwerm.sigma=c(0.05, 0.065, 0.09), label=c('-sh0','-sh5','-sh10'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='bwerm.mu' ), 'v', as.character(bwerm.mu))
set(pipeline.args, which( pipeline.args$stat=='bwerm.sigma' ), 'v', as.character(bwerm.sigma))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140918'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
# scenario 2* faster WH limited to 0.01
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.2,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.1,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.00447743)-0.3^2/2, wher.sigma=0.3,
bwerm.mu=NA, bwerm.sigma=NA)
# bwerm.mu is now shift
pipeline.vary <- data.table(bwerm.mu=c(log(0.002239075)-0.05^2/2, log(0.002239075-0.0005)-0.065^2/2, log(0.002239075-0.001)-0.09^2/2), bwerm.sigma=c(0.05, 0.065, 0.09), label=c('-sl0','-sl5','-sl10'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='bwerm.mu' ), 'v', as.character(bwerm.mu))
set(pipeline.args, which( pipeline.args$stat=='bwerm.sigma' ), 'v', as.character(bwerm.sigma))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140918'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
}
# changed: time to seq limited to max 6 years instead of 35 ;-); sample fraction 10%
if(0)
{
indir <- system.file(package="PANGEA.HIV.sim", "misc")
infile.ind <- '170914_HPTN071_scA_rep1'
infile.trm <- '170914_HPTN071_scA_rep1'
# scenario 2* faster WH limited to 0.01
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.1,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.1,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.00447743)-0.3^2/2, wher.sigma=0.3,
bwerm.mu=NA, bwerm.sigma=NA)
# bwerm.mu is now shift
pipeline.vary <- data.table(bwerm.mu=c(log(0.002239075)-0.05^2/2, log(0.002239075-0.0005)-0.065^2/2, log(0.002239075-0.001)-0.09^2/2), bwerm.sigma=c(0.05, 0.065, 0.09), label=c('-sl0','-sl5','-sl10'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='bwerm.mu' ), 'v', as.character(bwerm.mu))
set(pipeline.args, which( pipeline.args$stat=='bwerm.sigma' ), 'v', as.character(bwerm.sigma))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140918b'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
}
if(0)
{
indir <- system.file(package="PANGEA.HIV.sim", "misc")
infile.ind <- '170914_HPTN071_scA_rep1'
infile.trm <- '170914_HPTN071_scA_rep1'
# scenario 2* faster WH limited to 0.01
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.1,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.1,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=NA, wher.sigma=NA,
bwerm.mu=log(0.002239075)-0.13^2/2, bwerm.sigma=0.13)
# bwerm.mu is now shift
pipeline.vary <- data.table(wher.mu=c(log(0.00447743)-0.3^2/2, log(0.00447743)-0.4^2/2, log(0.00447743)-0.5^2/2), wher.sigma=c(0.3, 0.4, 0.5), label=c('-w3','-w4','-w5'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='wher.mu' ), 'v', as.character(wher.mu))
set(pipeline.args, which( pipeline.args$stat=='wher.sigma' ), 'v', as.character(wher.sigma))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140918c'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
}
# sense check on CoV
if(0)
{
indir <- system.file(package="PANGEA.HIV.sim", "misc")
infile.ind <- '180914_HPTN071_scA_rep1'
infile.trm <- '180914_HPTN071_scA_rep1'
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.1,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.1,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=NA, wher.sigma=NA, bwerm.mu=NA, bwerm.sigma=NA)
# no WH Er to equal BH ER, and both being constant
pipeline.vary <- data.table(wher.mu=c(log(0.002239075)-0.13^2/2, log(0.002239075)-0.01^2/2), wher.sigma=c(0.13, 0.01), label=c('-c13','-c0'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='wher.mu' ), 'v', as.character(wher.mu))
set(pipeline.args, which( pipeline.args$stat=='wher.sigma' ), 'v', as.character(wher.sigma))
set(pipeline.args, which( pipeline.args$stat=='bwerm.mu' ), 'v', as.character(wher.mu))
set(pipeline.args, which( pipeline.args$stat=='bwerm.sigma' ), 'v', as.character(wher.sigma))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140919'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
}
# changed: ignore variation in meanRate by gene, added debug options
if(0)
{
indir <- system.file(package="PANGEA.HIV.sim", "misc")
infile.ind <- '180914_HPTN071_scA_rep1'
infile.trm <- '180914_HPTN071_scA_rep1'
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.1,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.1,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=NA, wher.sigma=NA, bwerm.mu=NA, bwerm.sigma=NA,
dbg.GTRparam=1, dbg.rER=1)
# standard run, fixed GTR param + fixed relative rates for each transmission chain
# no WH Er to equal BH ER, and both being constant
pipeline.vary <- data.table( wher.mu=c(log(0.00447743)-0.3^2/2, log(0.002239075)-0.13^2/2, log(0.002239075)-0.01^2/2), wher.sigma=c(0.3, 0.13, 0.01),
bwerm.mu=c(log(0.002239075)-0.13^2/2, log(0.002239075)-0.13^2/2, log(0.002239075)-0.01^2/2), bwerm.sigma=c(0.13, 0.13, 0.01), label=c('-d1st','-d1s','-d1f'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='wher.mu' ), 'v', as.character(wher.mu))
set(pipeline.args, which( pipeline.args$stat=='wher.sigma' ), 'v', as.character(wher.sigma))
set(pipeline.args, which( pipeline.args$stat=='bwerm.mu' ), 'v', as.character(bwerm.mu))
set(pipeline.args, which( pipeline.args$stat=='bwerm.sigma' ), 'v', as.character(bwerm.sigma))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140920a'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
# debug only GTRparam
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.1,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.1,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=NA, wher.sigma=NA, bwerm.mu=NA, bwerm.sigma=NA,
dbg.GTRparam=1, dbg.rER=0)
# standard run, fixed GTR param + fixed relative rates for each transmission chain
# no WH Er to equal BH ER, and both being constant
pipeline.vary <- data.table( wher.mu=c(log(0.00447743)-0.3^2/2, log(0.002239075)-0.13^2/2, log(0.002239075)-0.01^2/2), wher.sigma=c(0.3, 0.13, 0.01),
bwerm.mu=c(log(0.002239075)-0.13^2/2, log(0.002239075)-0.13^2/2, log(0.002239075)-0.01^2/2), bwerm.sigma=c(0.13, 0.13, 0.01), label=c('-d10st','-d10s','-d10f'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='wher.mu' ), 'v', as.character(wher.mu))
set(pipeline.args, which( pipeline.args$stat=='wher.sigma' ), 'v', as.character(wher.sigma))
set(pipeline.args, which( pipeline.args$stat=='bwerm.mu' ), 'v', as.character(bwerm.mu))
set(pipeline.args, which( pipeline.args$stat=='bwerm.sigma' ), 'v', as.character(bwerm.sigma))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140920a'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
# no debug
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.1,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.1,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=NA, wher.sigma=NA, bwerm.mu=NA, bwerm.sigma=NA,
dbg.GTRparam=0, dbg.rER=0)
# standard run, fixed GTR param + fixed relative rates for each transmission chain
# no WH Er to equal BH ER, and both being constant
pipeline.vary <- data.table( wher.mu=c(log(0.00447743)-0.3^2/2, log(0.002239075)-0.13^2/2, log(0.002239075)-0.01^2/2), wher.sigma=c(0.3, 0.13, 0.01),
bwerm.mu=c(log(0.002239075)-0.13^2/2, log(0.002239075)-0.13^2/2, log(0.002239075)-0.01^2/2), bwerm.sigma=c(0.13, 0.13, 0.01), label=c('-d00st','-d00s','-d00f'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='wher.mu' ), 'v', as.character(wher.mu))
set(pipeline.args, which( pipeline.args$stat=='wher.sigma' ), 'v', as.character(wher.sigma))
set(pipeline.args, which( pipeline.args$stat=='bwerm.mu' ), 'v', as.character(bwerm.mu))
set(pipeline.args, which( pipeline.args$stat=='bwerm.sigma' ), 'v', as.character(bwerm.sigma))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140920a'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
}
# debug 11, no import
if(0)
{
indir <- system.file(package="PANGEA.HIV.sim", "misc")
infile.ind <- '180914_HPTN071_scA_rep1'
infile.trm <- '180914_HPTN071_scA_rep1'
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.1,
epi.model='HPTN071', epi.dt=1/48, epi.import=0,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=NA, wher.sigma=NA, bwerm.mu=NA, bwerm.sigma=NA,
dbg.GTRparam=1, dbg.rER=1)
# standard run, fixed GTR param + fixed relative rates for each transmission chain
# no WH Er to equal BH ER, and both being constant
pipeline.vary <- data.table( wher.mu=c(log(0.00447743)-0.3^2/2, log(0.002239075)-0.13^2/2, log(0.002239075)-0.01^2/2), wher.sigma=c(0.3, 0.13, 0.01),
bwerm.mu=c(log(0.002239075)-0.13^2/2, log(0.002239075)-0.13^2/2, log(0.002239075)-0.01^2/2), bwerm.sigma=c(0.13, 0.13, 0.01), label=c('-i11st','-i11s','-i11f'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='wher.mu' ), 'v', as.character(wher.mu))
set(pipeline.args, which( pipeline.args$stat=='wher.sigma' ), 'v', as.character(wher.sigma))
set(pipeline.args, which( pipeline.args$stat=='bwerm.mu' ), 'v', as.character(bwerm.mu))
set(pipeline.args, which( pipeline.args$stat=='bwerm.sigma' ), 'v', as.character(bwerm.sigma))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140921b'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
}
# debug 11, no import, index.starttime.mode=fix
if(0)
{
indir <- system.file(package="PANGEA.HIV.sim", "misc")
infile.ind <- '180914_HPTN071_scA_rep1'
infile.trm <- '180914_HPTN071_scA_rep1'
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.1,
epi.model='HPTN071', epi.dt=1/48, epi.import=0,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=NA, wher.sigma=NA, bwerm.mu=NA, bwerm.sigma=NA,
dbg.GTRparam=1, dbg.rER=1, index.starttime.mode='fix')
# standard run, fixed GTR param + fixed relative rates for each transmission chain
# no WH Er to equal BH ER, and both being constant
pipeline.vary <- data.table( wher.mu=c(log(0.00447743)-0.3^2/2, log(0.002239075)-0.13^2/2, log(0.002239075)-0.01^2/2), wher.sigma=c(0.3, 0.13, 0.01),
bwerm.mu=c(log(0.002239075)-0.13^2/2, log(0.002239075)-0.13^2/2, log(0.002239075)-0.01^2/2), bwerm.sigma=c(0.13, 0.13, 0.01), label=c('-S11st','-S11s','-S11f'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='wher.mu' ), 'v', as.character(wher.mu))
set(pipeline.args, which( pipeline.args$stat=='wher.sigma' ), 'v', as.character(wher.sigma))
set(pipeline.args, which( pipeline.args$stat=='bwerm.mu' ), 'v', as.character(bwerm.mu))
set(pipeline.args, which( pipeline.args$stat=='bwerm.sigma' ), 'v', as.character(bwerm.sigma))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140921b'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
}
# debug 11, 10% import, index.starttime.mode=fix
if(0)
{
indir <- system.file(package="PANGEA.HIV.sim", "misc")
infile.ind <- '180914_HPTN071_scA_rep1'
infile.trm <- '180914_HPTN071_scA_rep1'
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.1,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.1,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=NA, wher.sigma=NA, bwerm.mu=NA, bwerm.sigma=NA,
dbg.GTRparam=1, dbg.rER=1, index.starttime.mode='fix')
# standard run, fixed GTR param + fixed relative rates for each transmission chain
# no WH Er to equal BH ER, and both being constant
pipeline.vary <- data.table( wher.mu=c(log(0.00447743)-0.3^2/2, log(0.002239075)-0.13^2/2, log(0.002239075)-0.01^2/2), wher.sigma=c(0.3, 0.13, 0.01),
bwerm.mu=c(log(0.002239075)-0.13^2/2, log(0.002239075)-0.13^2/2, log(0.002239075)-0.01^2/2), bwerm.sigma=c(0.13, 0.13, 0.01), label=c('-w11st','-w11s','-w11f'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='wher.mu' ), 'v', as.character(wher.mu))
set(pipeline.args, which( pipeline.args$stat=='wher.sigma' ), 'v', as.character(wher.sigma))
set(pipeline.args, which( pipeline.args$stat=='bwerm.mu' ), 'v', as.character(bwerm.mu))
set(pipeline.args, which( pipeline.args$stat=='bwerm.sigma' ), 'v', as.character(bwerm.sigma))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140921b'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
}
# debug 11, no import, index.starttime.mode=fix, no rate variation
if(0)
{
indir <- system.file(package="PANGEA.HIV.sim", "misc")
infile.ind <- '180914_HPTN071_scA_rep1'
infile.trm <- '180914_HPTN071_scA_rep1'
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.1,
epi.model='HPTN071', epi.dt=1/48, epi.import=0,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=NA, wher.sigma=NA, bwerm.mu=NA, bwerm.sigma=NA, er.gamma=0,
dbg.GTRparam=1, dbg.rER=1, index.starttime.mode='fix')
# standard run, fixed GTR param + fixed relative rates for each transmission chain
# no WH Er to equal BH ER, and both being constant
pipeline.vary <- data.table( wher.mu=c(log(0.00447743)-0.3^2/2, log(0.002239075)-0.13^2/2, log(0.002239075)-0.01^2/2), wher.sigma=c(0.3, 0.13, 0.01),
bwerm.mu=c(log(0.002239075)-0.13^2/2, log(0.002239075)-0.13^2/2, log(0.002239075)-0.01^2/2), bwerm.sigma=c(0.13, 0.13, 0.01), label=c('-R11st','-R11s','-R11f'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='wher.mu' ), 'v', as.character(wher.mu))
set(pipeline.args, which( pipeline.args$stat=='wher.sigma' ), 'v', as.character(wher.sigma))
set(pipeline.args, which( pipeline.args$stat=='bwerm.mu' ), 'v', as.character(bwerm.mu))
set(pipeline.args, which( pipeline.args$stat=='bwerm.sigma' ), 'v', as.character(bwerm.sigma))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140922'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
}
# debug 11, no import, index.starttime.mode=shift, no rate variation
if(0)
{
indir <- system.file(package="PANGEA.HIV.sim", "misc")
infile.ind <- '180914_HPTN071_scA_rep1'
infile.trm <- '180914_HPTN071_scA_rep1'
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.1,
epi.model='HPTN071', epi.dt=1/48, epi.import=0,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=NA, wher.sigma=NA, bwerm.mu=NA, bwerm.sigma=NA, er.gamma=0,
dbg.GTRparam=1, dbg.rER=1, index.starttime.mode='shift')
# standard run, fixed GTR param + fixed relative rates for each transmission chain
# no WH Er to equal BH ER, and both being constant
pipeline.vary <- data.table( wher.mu=c(log(0.00447743)-0.3^2/2, log(0.002239075)-0.13^2/2, log(0.002239075)-0.01^2/2), wher.sigma=c(0.3, 0.13, 0.01),
bwerm.mu=c(log(0.002239075)-0.13^2/2, log(0.002239075)-0.13^2/2, log(0.002239075)-0.01^2/2), bwerm.sigma=c(0.13, 0.13, 0.01), label=c('-sh11st','-sh11s','-sh11f'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='wher.mu' ), 'v', as.character(wher.mu))
set(pipeline.args, which( pipeline.args$stat=='wher.sigma' ), 'v', as.character(wher.sigma))
set(pipeline.args, which( pipeline.args$stat=='bwerm.mu' ), 'v', as.character(bwerm.mu))
set(pipeline.args, which( pipeline.args$stat=='bwerm.sigma' ), 'v', as.character(bwerm.sigma))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140923'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
}
# debug 11, no import, same starting seq, index.starttime.mode=shift, no rate variation,
if(0)
{
indir <- system.file(package="PANGEA.HIV.sim", "misc")
infile.ind <- '180914_HPTN071_scA_rep1'
infile.trm <- '180914_HPTN071_scA_rep1'
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.1,
epi.model='HPTN071', epi.dt=1/48, epi.import=0,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=NA, wher.sigma=NA, bwerm.mu=NA, bwerm.sigma=NA, er.gamma=0,
dbg.GTRparam=1, dbg.rER=1, index.starttime.mode='shift', startseq.mode='fix')
# standard run, fixed GTR param + fixed relative rates for each transmission chain
# no WH Er to equal BH ER, and both being constant
pipeline.vary <- data.table( wher.mu=c(log(0.00447743)-0.3^2/2, log(0.002239075)-0.13^2/2, log(0.002239075)-0.01^2/2), wher.sigma=c(0.3, 0.13, 0.01),
bwerm.mu=c(log(0.002239075)-0.13^2/2, log(0.002239075)-0.13^2/2, log(0.002239075)-0.01^2/2), bwerm.sigma=c(0.13, 0.13, 0.01), label=c('-z11st','-z11s','-z11f'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='wher.mu' ), 'v', as.character(wher.mu))
set(pipeline.args, which( pipeline.args$stat=='wher.sigma' ), 'v', as.character(wher.sigma))
set(pipeline.args, which( pipeline.args$stat=='bwerm.mu' ), 'v', as.character(bwerm.mu))
set(pipeline.args, which( pipeline.args$stat=='bwerm.sigma' ), 'v', as.character(bwerm.sigma))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140924'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
indir <- system.file(package="PANGEA.HIV.sim", "misc")
infile.ind <- '180914_HPTN071_scA_rep1'
infile.trm <- '180914_HPTN071_scA_rep1'
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.1,
epi.model='HPTN071', epi.dt=1/48, epi.import=0,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=NA, wher.sigma=NA, bwerm.mu=NA, bwerm.sigma=NA, er.gamma=0,
dbg.GTRparam=1, dbg.rER=1, index.starttime.mode='fix', startseq.mode='fix')
# standard run, fixed GTR param + fixed relative rates for each transmission chain
# no WH Er to equal BH ER, and both being constant
pipeline.vary <- data.table( wher.mu=c(log(0.00447743)-0.3^2/2, log(0.002239075)-0.13^2/2, log(0.002239075)-0.01^2/2), wher.sigma=c(0.3, 0.13, 0.01),
bwerm.mu=c(log(0.002239075)-0.13^2/2, log(0.002239075)-0.13^2/2, log(0.002239075)-0.01^2/2), bwerm.sigma=c(0.13, 0.13, 0.01), label=c('-f11st','-f11s','-f11f'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='wher.mu' ), 'v', as.character(wher.mu))
set(pipeline.args, which( pipeline.args$stat=='wher.sigma' ), 'v', as.character(wher.sigma))
set(pipeline.args, which( pipeline.args$stat=='bwerm.mu' ), 'v', as.character(bwerm.mu))
set(pipeline.args, which( pipeline.args$stat=='bwerm.sigma' ), 'v', as.character(bwerm.sigma))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140924'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
}
# try and finalize:
# #startseq=1, starttime~1960, k=4 (did not make a big difference), standard run, import 2.5% 5% GTR 00/11
if(0)
{
indir <- system.file(package="PANGEA.HIV.sim", "misc")
infile.ind <- '180914_HPTN071_scA_rep1'
infile.trm <- '180914_HPTN071_scA_rep1'
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.1,
epi.model='HPTN071', epi.dt=1/48, epi.import=NA,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.00447743)-0.3^2/2, wher.sigma=0.3, bwerm.mu=log(0.002239075)-0.13^2/2, bwerm.sigma=0.13, er.gamma=4,
dbg.GTRparam=1, dbg.rER=1, index.starttime.mode='fix', startseq.mode='fix')
# standard run, fixed GTR param + fixed relative rates for each transmission chain
# no WH Er to equal BH ER, and both being constant
pipeline.vary <- data.table( epi.import=c(0, 0.025, 0.05), label=c('-i110','-i112','-i115'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='epi.import' ), 'v', as.character(epi.import))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140925'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
#
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.1,
epi.model='HPTN071', epi.dt=1/48, epi.import=NA,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.00447743)-0.3^2/2, wher.sigma=0.3, bwerm.mu=log(0.002239075)-0.13^2/2, bwerm.sigma=0.13, er.gamma=4,
dbg.GTRparam=0, dbg.rER=0, index.starttime.mode='fix', startseq.mode='fix')
# standard run, fixed GTR param + fixed relative rates for each transmission chain
# no WH Er to equal BH ER, and both being constant
pipeline.vary <- data.table( epi.import=c(0, 0.025, 0.05), label=c('-i000','-i002','-i005'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='epi.import' ), 'v', as.character(epi.import))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140925'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
# try improve TMRCA?
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.1,
epi.model='HPTN071', epi.dt=1/48, epi.import=NA,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.00447743)-0.3^2/2, wher.sigma=0.3, bwerm.mu=log(0.002239075)-0.13^2/2, bwerm.sigma=0.13, er.gamma=4,
dbg.GTRparam=1, dbg.rER=1, index.starttime.mode='fix45', startseq.mode='fix')
# standard run, fixed GTR param + fixed relative rates for each transmission chain
# no WH Er to equal BH ER, and both being constant
pipeline.vary <- data.table( epi.import=c(0, 0.025, 0.05), label=c('-e110','-e112','-e115'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='epi.import' ), 'v', as.character(epi.import))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140926'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
}
# bugfix in SeqGen NEWICK file
if(0)
{
indir <- system.file(package="PANGEA.HIV.sim", "misc")
infile.ind <- '180914_HPTN071_scA_rep1'
infile.trm <- '180914_HPTN071_scA_rep1'
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.1,
epi.model='HPTN071', epi.dt=1/48, epi.import=NA,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.00447743)-0.3^2/2, wher.sigma=0.3, bwerm.mu=log(0.002239075)-0.13^2/2, bwerm.sigma=0.13, er.gamma=4,
dbg.GTRparam=1, dbg.rER=1, index.starttime.mode='normal', startseq.mode='sample')
# standard run, fixed GTR param + fixed relative rates for each transmission chain
# no WH Er to equal BH ER, and both being constant
pipeline.vary <- data.table( epi.import=c(0, 0.05, 0.1), label=c('-n110','-n115','-n111'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='epi.import' ), 'v', as.character(epi.import))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140926'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.1,
epi.model='HPTN071', epi.dt=1/48, epi.import=NA,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.00447743)-0.3^2/2, wher.sigma=0.3, bwerm.mu=log(0.002239075)-0.13^2/2, bwerm.sigma=0.13, er.gamma=4,
dbg.GTRparam=1, dbg.rER=1, index.starttime.mode='fix', startseq.mode='sample')
# standard run, fixed GTR param + fixed relative rates for each transmission chain
# no WH Er to equal BH ER, and both being constant
pipeline.vary <- data.table( epi.import=c(0, 0.05, 0.1), label=c('-f110','-f115','-f111'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='epi.import' ), 'v', as.character(epi.import))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140926'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.1,
epi.model='HPTN071', epi.dt=1/48, epi.import=NA,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.00447743)-0.3^2/2, wher.sigma=0.3, bwerm.mu=log(0.002239075)-0.13^2/2, bwerm.sigma=0.13, er.gamma=4,
dbg.GTRparam=1, dbg.rER=1, index.starttime.mode='fix', startseq.mode='fix')
# standard run, fixed GTR param + fixed relative rates for each transmission chain
# no WH Er to equal BH ER, and both being constant
pipeline.vary <- data.table( epi.import=c(0, 0.05, 0.1), label=c('-s110','-s115','-s111'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='epi.import' ), 'v', as.character(epi.import))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140926'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.1,
epi.model='HPTN071', epi.dt=1/48, epi.import=NA,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.00447743)-0.3^2/2, wher.sigma=0.3, bwerm.mu=log(0.002239075)-0.13^2/2, bwerm.sigma=0.13, er.gamma=4,
dbg.GTRparam=1, dbg.rER=1, index.starttime.mode='fix45', startseq.mode='fix')
# standard run, fixed GTR param + fixed relative rates for each transmission chain
# no WH Er to equal BH ER, and both being constant
pipeline.vary <- data.table( epi.import=c(0, 0.05, 0.1), label=c('-e110','-e115','-e111'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='epi.import' ), 'v', as.character(epi.import))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140926'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
}
# try and finalize simulations
if(0)
{
indir <- system.file(package="PANGEA.HIV.sim", "misc")
infile.ind <- '180914_HPTN071_scA_rep1'
infile.trm <- '180914_HPTN071_scA_rep1'
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.1,
epi.model='HPTN071', epi.dt=1/48, epi.import=NA,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.00447743)-0.3^2/2, wher.sigma=0.3, bwerm.mu=log(0.002239075)-0.13^2/2, bwerm.sigma=0.13, er.gamma=0,
dbg.GTRparam=1, dbg.rER=1, index.starttime.mode='fix', startseq.mode='sample')
# standard run, fixed GTR param + fixed relative rates for each transmission chain
# no WH Er to equal BH ER, and both being constant
pipeline.vary <- data.table( epi.import=c(0, 0.05, 0.1), label=c('-f0110','-f0115','-f0111'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='epi.import' ), 'v', as.character(epi.import))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140927'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.1,
epi.model='HPTN071', epi.dt=1/48, epi.import=NA,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.00447743)-0.3^2/2, wher.sigma=0.3, bwerm.mu=log(0.002239075)-0.13^2/2, bwerm.sigma=0.13, er.gamma=0,
dbg.GTRparam=0, dbg.rER=0, index.starttime.mode='fix', startseq.mode='sample')
# standard run, fixed GTR param + fixed relative rates for each transmission chain
# no WH Er to equal BH ER, and both being constant
pipeline.vary <- data.table( epi.import=c(0, 0.05, 0.1), label=c('-f0000','-f0005','-f0001'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='epi.import' ), 'v', as.character(epi.import))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140927'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.1,
epi.model='HPTN071', epi.dt=1/48, epi.import=NA,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.00447743)-0.3^2/2, wher.sigma=0.3, bwerm.mu=log(0.002239075)-0.13^2/2, bwerm.sigma=0.13, er.gamma=4,
dbg.GTRparam=0, dbg.rER=0, index.starttime.mode='fix', startseq.mode='sample')
# standard run, fixed GTR param + fixed relative rates for each transmission chain
# no WH Er to equal BH ER, and both being constant
pipeline.vary <- data.table( epi.import=c(0, 0.05, 0.1), label=c('-f4000','-f4005','-f4001'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='epi.import' ), 'v', as.character(epi.import))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140927'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.1,
epi.model='HPTN071', epi.dt=1/48, epi.import=NA,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.00447743)-0.3^2/2, wher.sigma=0.3, bwerm.mu=log(0.002239075)-0.13^2/2, bwerm.sigma=0.13, er.gamma=0,
dbg.GTRparam=1, dbg.rER=1, index.starttime.mode='normal', startseq.mode='sample')
# standard run, fixed GTR param + fixed relative rates for each transmission chain
# no WH Er to equal BH ER, and both being constant
pipeline.vary <- data.table( epi.import=c(0, 0.05, 0.1), label=c('-n0110','-n0115','-n0111'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='epi.import' ), 'v', as.character(epi.import))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140927'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.1,
epi.model='HPTN071', epi.dt=1/48, epi.import=NA,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.00447743)-0.3^2/2, wher.sigma=0.3, bwerm.mu=log(0.002239075)-0.13^2/2, bwerm.sigma=0.13, er.gamma=0,
dbg.GTRparam=0, dbg.rER=0, index.starttime.mode='normal', startseq.mode='sample')
# standard run, fixed GTR param + fixed relative rates for each transmission chain
# no WH Er to equal BH ER, and both being constant
pipeline.vary <- data.table( epi.import=c(0, 0.05, 0.1), label=c('-n0000','-n0005','-n0001'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='epi.import' ), 'v', as.character(epi.import))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140927'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=0.1,
epi.model='HPTN071', epi.dt=1/48, epi.import=NA,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.00447743)-0.3^2/2, wher.sigma=0.3, bwerm.mu=log(0.002239075)-0.13^2/2, bwerm.sigma=0.13, er.gamma=4,
dbg.GTRparam=0, dbg.rER=0, index.starttime.mode='normal', startseq.mode='sample')
# standard run, fixed GTR param + fixed relative rates for each transmission chain
# no WH Er to equal BH ER, and both being constant
pipeline.vary <- data.table( epi.import=c(0, 0.05, 0.1), label=c('-n4000','-n4005','-n4001'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='epi.import' ), 'v', as.character(epi.import))
print(pipeline.args)
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140927'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
#
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
}
# final three on all scenarios
if(1)
{
indir <- system.file(package="PANGEA.HIV.sim", "misc")
pipeline.args <- sim.regional.args( yr.start=1980, yr.end=2020, seed=42,
s.INC.recent=0.1, s.INC.recent.len=5, s.PREV.min=0.01, s.PREV.max=NA,
epi.model='HPTN071', epi.dt=1/48, epi.import=0.05,
v.N0tau=1, v.r=2.851904, v.T50=-2,
wher.mu=log(0.00447743)-0.3^2/2, wher.sigma=0.3, bwerm.mu=log(0.002239075)-0.13^2/2, bwerm.sigma=0.13, er.gamma=NA,
dbg.GTRparam=NA, dbg.rER=NA, index.starttime.mode=NA, startseq.mode='sample')
# standard run, fixed GTR param + fixed relative rates for each transmission chain
# no WH Er to equal BH ER, and both being constant
pipeline.vary <- data.table( dbg.GTRparam=c(1,1,0),
dbg.rER=c(1,1,0),
index.starttime.mode=c('fix','normal','normal'),
er.gamma=c(4,4,0),
label=c('-f4115','-n4115','-n0005'))
dummy <- pipeline.vary[, {
set(pipeline.args, which( pipeline.args$stat=='dbg.GTRparam' ), 'v', as.character(dbg.GTRparam))
set(pipeline.args, which( pipeline.args$stat=='dbg.rER' ), 'v', as.character(dbg.rER))
set(pipeline.args, which( pipeline.args$stat=='index.starttime.mode' ), 'v', as.character(index.starttime.mode))
set(pipeline.args, which( pipeline.args$stat=='er.gamma' ), 'v', as.character(er.gamma))
print(pipeline.args)
# scenario A
infile.ind <- '180914_HPTN071_scA_rep1'
infile.trm <- '180914_HPTN071_scA_rep1'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140928a'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max' ), 'v', '0.11')
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
# scenario B
infile.ind <- '180914_HPTN071_scB_rep1'
infile.trm <- '180914_HPTN071_scB_rep1'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140928b'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max' ), 'v', '0.15')
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
# scenario C
infile.ind <- '180914_HPTN071_scC_rep1'
infile.trm <- '180914_HPTN071_scC_rep1'
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140928c'
tmpdir <- paste(tmpdir,label,sep='')
dir.create(tmpdir, showWarnings=FALSE)
set(pipeline.args, which( pipeline.args$stat=='s.PREV.max' ), 'v', '0.185')
file.copy(paste(indir,'/',infile.ind,'_IND.csv',sep=''), paste(tmpdir,'/',infile.ind,label,'_IND.csv',sep=''))
file.copy(paste(indir,'/',infile.trm,'_TRM.csv',sep=''), paste(tmpdir,'/',infile.trm,label,'_TRM.csv',sep=''))
file <- sim.regional(tmpdir, paste(infile.ind,label,'_IND.csv',sep=''), paste(infile.trm,label,'_TRM.csv',sep=''), tmpdir, pipeline.args=pipeline.args)
system(file)
}, by='label']
}
}
##--------------------------------------------------------------------------------------------------------
## check simulated sequences: create ExaML tree and estimate R2
## olli 02.02.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.SSApg.CLUSTERBEAST.skygrid.hky<- function()
{
require(rBEAST)
tree.id.labelsep <- '|'
tree.id.label.idx.ctime <- 4
#select <- 'grid-mseq500'
#select <- 'grid-cseq3'
select <- 'grid-mseq400'
indir <- '/Users/Oliver/duke/2014_Gates/methods_comparison_pipeline/150414'
#indir <- '/Users/Oliver/git/HPTN071sim/tmp140914/140716_RUN001_INTERNAL'
outdir <- indir
infiles <- list.files(indir, '.*INTERNAL.R$', full.names=FALSE)
#stopifnot(length(infiles)==1)
#
# run
#
selects <- c( paste('grid-mseq',seq(600, 1200, 200), sep=''), paste('grid-clsmseq',seq(600, 1200, 200), sep=''), paste('grid-clrndseq',seq(600, 1200, 200), sep=''))
for(select in selects)
{
for(i in seq_along(infiles))
{
infile <- infiles[i]
# load simulated data
file <- paste(indir, '/', infile, sep='')
file.name <- paste(indir, gsub('_SIMULATED_INTERNAL.R',paste('_TEST_pol_HKY_fixedtree_',select,'.xml',sep=''),infile), sep='/')
cat(paste('\nLoading file', file))
load(file) #expect "df.epi" "df.trms" "df.inds" "df.sample" "df.seq"
set( df.seq, NULL, 'IDCLU', df.seq[, as.integer(IDCLU)] )
setnames(df.seq, c("LABEL", "IDCLU", "IDPOP"), c("TAXON_NAME", "CLU_ID", "TAXON_ID"))
#
seq.select <- beast.choose.seq.by.clusters(df.seq, select, verbose=1)
#
# read NEWICK trees for each cluster phylogeny, if there
#
phd <- NULL
tmp <- list.files(indir, '_DATEDTREE.newick$', full.names=FALSE)
tmp <- tmp[ grepl(substr(infile, 1, regexpr('_SIMULATED',infile)), tmp) ]
if(length(tmp))
{
# select
phd <- read.tree(paste(indir, tmp, sep='/'))
tmp2 <- data.table(TAXON_ID= sapply(phd, function(x) x$tip.label[1]), IDX=seq_along(phd))
set( tmp2, NULL, 'TAXON_ID', tmp2[, as.integer(substring(sapply(strsplit(TAXON_ID, tree.id.labelsep, fixed=TRUE),'[[',1),7)) ] )
tmp2 <- merge(subset(seq.select, select=c(TAXON_ID, CLU_ID)), tmp2, by='TAXON_ID')
phd <- lapply(tmp2[,IDX], function(i) phd[[i]] )
names(phd) <- tmp2[, CLU_ID]
# plot
phd.plot <- eval(parse(text=paste('phd[[',seq_along(phd),']]', sep='',collapse='+')))
#phd.plot <- drop.tip(phd.plot, which(grepl('NOEXIST', phd.plot$tip.label)), root.edge=1)
phd.plot <- ladderize(phd.plot)
tmp <- paste(indir, '/', gsub('DATEDTREE','BEASTDATEDTREE',tmp), sep='')
pdf(file=gsub('.xml','_BEASTDATEDTREE.pdf',file.name), w=10, h=Ntip(phd.plot)*0.1)
plot(phd.plot, show.tip=TRUE, cex=0.5)
dev.off()
}
#
# create BEAST XML POL
#
seq.select.pol <- subset(seq.select, select=c("CLU_ID", "TAXON_ID", "TAXON_NAME", "POL" ))
setnames(seq.select.pol, 'POL', 'SEQ')
bxml <- beastscript.multilocus.hky( file.name, seq.select.pol, phd, verbose=1 )
cat(paste("\nwrite xml file to",file.name))
saveXML(bxml, file=file.name)
}
}
}
##--------------------------------------------------------------------------------------------------------
## check simulated sequences: create ExaML tree and estimate R2
## olli 02.02.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.SSApg.CLUSTERBEAST.skygrid.codon.gtr<- function()
{
require(rBEAST)
tree.id.labelsep <- '|'
tree.id.label.idx.ctime <- 4
#select <- 'grid-mseq500'
select <- 'grid-cseq3'
select <- 'grid-mseq400'
indir <- '/Users/Oliver/duke/2014_Gates/methods_comparison_pipeline/150414'
#indir <- '/Users/Oliver/git/HPTN071sim/tmp140914/140716_RUN001_INTERNAL'
outdir <- indir
infiles <- list.files(indir, '.*INTERNAL.R$', full.names=FALSE)
#stopifnot(length(infiles)==1)
#
# run
#
selects <- c( paste('grid-mseq',seq(600, 1200, 200), sep=''), paste('grid-clsmseq',seq(600, 1200, 200), sep=''), paste('grid-clrndseq',seq(600, 1200, 200), sep=''))
selects <- paste('grid-mseq',seq(600, 1200, 200), sep='')
for(select in selects)
{
for(i in seq_along(infiles))
{
infile <- infiles[i]
# load simulated data
file <- paste(indir, '/', infile, sep='')
cat(paste('\nLoading file', file))
load(file) #expect "df.epi" "df.trms" "df.inds" "df.sample" "df.seq"
file.name <- paste(indir, gsub('_SIMULATED_INTERNAL.R',paste('_TEST_pol_CODON-GTR_fixedtree_',select,'.xml',sep=''),infile), sep='/')
set( df.seq, NULL, 'IDCLU', df.seq[, as.integer(IDCLU)] )
setnames(df.seq, c("LABEL", "IDCLU", "IDPOP"), c("TAXON_NAME", "CLU_ID", "TAXON_ID"))
#
seq.select <- beast.choose.seq.by.clusters(df.seq, select, verbose=1)
#
# read NEWICK trees for each cluster phylogeny, if there
#
phd <- NULL
tmp <- list.files(indir, '_DATEDTREE.newick$', full.names=FALSE)
tmp <- tmp[ grepl(substr(infile, 1, regexpr('_SIMULATED',infile)), tmp) ]
if(length(tmp))
{
# select
phd <- read.tree(paste(indir, tmp, sep='/'))
tmp2 <- data.table(TAXON_ID= sapply(phd, function(x) x$tip.label[1]), IDX=seq_along(phd))
set( tmp2, NULL, 'TAXON_ID', tmp2[, as.integer(substring(sapply(strsplit(TAXON_ID, tree.id.labelsep, fixed=TRUE),'[[',1),7)) ] )
tmp2 <- merge(subset(seq.select, select=c(TAXON_ID, CLU_ID)), tmp2, by='TAXON_ID')
phd <- lapply(tmp2[,IDX], function(i) phd[[i]] )
names(phd) <- tmp2[, CLU_ID]
# plot
phd.plot <- eval(parse(text=paste('phd[[',seq_along(phd),']]', sep='',collapse='+')))
#phd.plot <- drop.tip(phd.plot, which(grepl('NOEXIST', phd.plot$tip.label)), root.edge=1)
phd.plot <- ladderize(phd.plot)
pdf(file=gsub('.xml','_BEASTDATEDTREE.pdf',file.name), w=10, h=Ntip(phd.plot)*0.1)
plot(phd.plot, show.tip=TRUE, cex=0.5)
dev.off()
}
#
# create BEAST XML POL
#
seq.select.pol <- subset(seq.select, select=c("CLU_ID", "TAXON_ID", "TAXON_NAME", "POL" ))
setnames(seq.select.pol, 'POL', 'SEQ')
bxml <- beastscript.multilocus.codon.gtr( file.name, seq.select.pol, phd, verbose=1 )
cat(paste("\nwrite xml file to",file.name))
saveXML(bxml, file=file.name)
}
}
}
##--------------------------------------------------------------------------------------------------------
## check simulated sequences: create ExaML tree and estimate R2
## olli 04.02.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.SeqNo<- function()
{
indir <- '/Users/Oliver/duke/2014_Gates/methods_comparison_pipeline/150205'
dfi <- data.table(FILE=list.files(indir, '.*SIMULATED_INTERNAL.R$', full.names=FALSE))
dfi[, SC:= sapply(strsplit(FILE, '_'),'[[',3)]
dfi[, CONFIG:= sapply(strsplit(SC, '-'),'[[',2)]
set(dfi, NULL, 'SC', dfi[, sapply(strsplit(SC, '-'),'[[',1)])
dfs <- dfi[,{
file <- paste(indir, FILE, sep='/' )
cat(paste('\nprocess file=',file))
load(file)
df.inds
tmp2 <- subset( df.inds, DOD>pipeline.args['yr.end',][, as.numeric(v)] & floor(TIME_TR)<pipeline.args['yr.end',][, as.numeric(v)])
sc.alive20.infl20 <- tmp2[, length(which(!is.na(TIME_SEQ)))/length(TIME_SEQ) ]
tmp2 <- subset( df.inds, floor(TIME_TR)>=2000 & floor(TIME_TR)<pipeline.args['yr.end',][, as.numeric(v)])
sc.infg99l20 <- tmp2[, length(which(!is.na(TIME_SEQ)))/length(TIME_SEQ) ]
sn.g14 <- subset( df.inds, TIME_SEQ>=pipeline.args['s.INTERVENTION.start',][, as.numeric(v)] & floor(TIME_TR)<pipeline.args['yr.end',][, as.numeric(v)])[, length(which(!is.na(TIME_SEQ))) ]
sn.total <- subset(df.inds, floor(TIME_TR)<pipeline.args['yr.end',][, as.numeric(v)])[, length(which(!is.na(TIME_SEQ))) ]
list(SN_TOTAL=sn.total, SN_G14=sn.g14, SC_20=sc.alive20.infl20, SC_9920=sc.infg99l20)
}, by=c('SC','CONFIG')]
setkey(dfs, CONFIG, SC)
dfs[, PC_G14:= SN_G14/SN_TOTAL]
}
##--------------------------------------------------------------------------------------------------------
## check simulated sequences: create ExaML tree and estimate R2
## olli 04.02.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.Surveys<- function()
{
indir <- '/Users/Oliver/duke/2014_Gates/methods_comparison_pipeline/150205'
dfi <- data.table(FILE=list.files(indir, '.*SIMULATED_INTERNAL.R$', full.names=FALSE))
dfi[, SC:= sapply(strsplit(FILE, '_'),'[[',3)]
dfi[, CONFIG:= sapply(strsplit(SC, '-'),'[[',2)]
set(dfi, NULL, 'SC', dfi[, sapply(strsplit(SC, '-'),'[[',1)])
#
# complete census
#
df.sp <- dfi[,{
file <- paste(indir, FILE, sep='/' )
cat(paste('\nprocess file=',file))
load(file)
df.sp <- PANGEA.SeroPrevalenceSurvey(df.inds, epi.adult=13, s.INTERVENTION.start=2015, sp.prop.of.sexactive=1, sp.times=c(12, 8, 4, 0), test=1 )
df.sp[, list(ALIVE_N=sum(ALIVE_N), ALIVE_AND_HIV_N=sum(ALIVE_AND_HIV_N), ALIVE_AND_DIAG_N=sum(ALIVE_AND_DIAG_N), ALIVE_AND_ART_N=sum(ALIVE_AND_ART_N), ALIVE_AND_SEQ_N=sum(ALIVE_AND_SEQ_N)), by='YR']
}, by=c('SC','CONFIG')]
df.sp[, DIAG_PC:= ALIVE_AND_DIAG_N/ALIVE_AND_HIV_N]
df.sp[, ART_PC:= ALIVE_AND_ART_N/ALIVE_AND_HIV_N]
df.sp[, SEQ_PC:= ALIVE_AND_SEQ_N/ALIVE_AND_HIV_N]
df.sp <- melt( df.sp, id.vars=c('SC','CONFIG','YR'), measure.vars=c('DIAG_PC','ART_PC','SEQ_PC') )
ggplot(df.sp, aes(x=YR, y=value, colour=SC, group=SC)) + geom_point() + geom_line() +
scale_color_brewer(name='scenario', palette='Paired') +
scale_x_continuous(breaks=df.sp[, unique(YR)], expand=c(.2,.2)) +
facet_wrap(CONFIG~variable, scales='free_y', ncol=3) +
theme_bw() + labs(x='', y='proportion')
file <- paste(indir, '150205_TEST_SurveyComplete.pdf')
ggsave(file=file, w=12, h=6)
#
# survey on 5%
#
df.sp <- dfi[,{
file <- paste(indir, FILE, sep='/' )
cat(paste('\nprocess file=',file))
load(file)
df.sp
df.sp[, list(ALIVE_N=sum(ALIVE_N), ALIVE_AND_DIAG_N=sum(ALIVE_AND_DIAG_N), ALIVE_AND_ART_N=sum(ALIVE_AND_ART_N), ALIVE_AND_SEQ_N=sum(ALIVE_AND_SEQ_N)), by='YR']
}, by=c('SC','CONFIG')]
df.sp <- melt( df.sp, id.vars=c('SC','CONFIG','YR'), measure.vars=c('ALIVE_N','ALIVE_AND_DIAG_N','ALIVE_AND_ART_N','ALIVE_AND_SEQ_N') )
ggplot(df.sp, aes(x=YR, y=value, colour=SC, group=SC)) + geom_point() + geom_line() +
scale_color_brewer(name='scenario', palette='Paired') +
scale_x_continuous(breaks=df.sp[, unique(YR)], expand=c(.2,.2)) +
facet_wrap(CONFIG~variable, scales='free_y', ncol=4) +
theme_bw() + labs(x='', y='proportion')
file <- paste(indir, '150205_TEST_SurveySampled.pdf')
ggsave(file=file, w=14, h=7)
}
##--------------------------------------------------------------------------------------------------------
## check simulated sequences: create ExaML tree and estimate R2
## olli 03.02.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.PropCD4atDiag<- function()
{
#check time to coalescence in true dated tree
label.sep <- '|'
label.idx.date <- 4
label.idx.idpop <- 1
indir <- '/Users/Oliver/duke/2014_Gates/methods_comparison_pipeline/150205'
dfi <- data.table(FILE=list.files(indir, '.*SIMULATED_INTERNAL.R$', full.names=FALSE))
dfi[, SC:= sapply(strsplit(FILE, '_'),'[[',3)]
dfi[, CONFIG:= sapply(strsplit(SC, '-'),'[[',2)]
set(dfi, NULL, 'SC', dfi[, sapply(strsplit(SC, '-'),'[[',1)])
df <- dfi[, {
file <- paste(indir, FILE, sep='/' )
cat(paste('\nprocess',file))
load(file)
ans <- subset(df.inds, !is.na(DIAG_T) & DIAG_T<2020, c(TIME_SEQ, RECENT_TR, DIAG_T, DIAG_CD4 ))
tmp <- sample(seq_len(nrow(ans)), round(nrow(ans)*.5))
ans[, RECENT_TRr:= NA_character_]
set(ans, tmp, 'RECENT_TRr', ans[tmp, as.character(RECENT_TR)])
ans
}, by=c('SC', 'CONFIG')]
df[, DIAG_CD4c:= df[, cut(DIAG_CD4, breaks=c(0,200,350,500,700,2000))]]
set( df, NULL, 'RECENT_TRr', df[, factor(RECENT_TRr)] )
# plot CD4 counts at time of diagnosis
df <- subset(df, CONFIG%in%c('mFP25','mFP50','mFP85'))
ggplot(df, aes(x=floor(DIAG_T), fill=DIAG_CD4c)) + geom_bar(binwidth=1, position='fill', alpha=0.5) +
labs(fill='CD4 category', y='percent of diagnosed', x='year diagnosed') + scale_y_continuous(breaks=seq(0,1,0.1)) + scale_fill_brewer(palette='Set1') +
theme_bw() + theme(panel.grid.major.y=element_line(colour="black", size=0.2)) + facet_grid(SC~CONFIG)
file <- paste(indir, '150205_TEST_CD4atDiagnosis.pdf')
ggsave(file=file, w=12, h=12)
ggplot(subset(df, !is.na(TIME_SEQ)), aes(x=floor(DIAG_T), fill=DIAG_CD4c)) + geom_bar(binwidth=1, position='fill', alpha=0.5) +
labs(fill='CD4 category', y='percent of diagnosed\namong sequenced', x='year diagnosed') + scale_y_continuous(breaks=seq(0,1,0.1)) + scale_fill_brewer(palette='Set1') +
theme_bw() + theme(panel.grid.major.y=element_line(colour="black", size=0.2)) + facet_grid(SC~CONFIG)
file <- paste(indir, '150205_TEST_CD4atDiagnosisAmongSequenced.pdf')
ggsave(file=file, w=12, h=12)
# plot recent at time of diagnosis
ggplot(df, aes(x=floor(DIAG_T), fill=RECENT_TR)) + geom_bar(binwidth=1, position='fill', alpha=0.5) +
labs(fill='Infected in\nlast 6 months', y='percent of diagnosed', x='year diagnosed') + scale_y_continuous(breaks=seq(0,1,0.1)) + scale_fill_brewer(palette='Set1') +
theme_bw() + theme(panel.grid.major.y=element_line(colour="black", size=0.2)) + facet_grid(SC~CONFIG)
file <- paste(indir, '150205_TEST_RECENT.pdf')
ggsave(file=file, w=12, h=12)
ggplot(subset(df, !is.na(TIME_SEQ) & !is.na(RECENT_TRr)), aes(x=floor(DIAG_T), fill=RECENT_TRr)) + geom_bar(binwidth=1, position='fill', alpha=0.5) +
labs(fill='Infected in\nlast 6 months', y='percent of diagnosed\nfor whom recency known and sequenced', x='year diagnosed') + scale_y_continuous(breaks=seq(0,1,0.1)) + scale_fill_brewer(palette='Set1') +
theme_bw() + theme(panel.grid.major.y=element_line(colour="black", size=0.2)) + facet_grid(SC~CONFIG)
file <- paste(indir, '150205_TEST_RECENTAmongSequenced50.pdf')
ggsave(file=file, w=12, h=12)
}
##--------------------------------------------------------------------------------------------------------
## check simulated sequences: create ExaML tree and estimate R2
## olli 03.02.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.CoClusteringAcute<- function()
{
label.sep <- '|'
label.idx.date <- 4
label.idx.idpop <- 1
indir <- '/Users/Oliver/duke/2014_Gates/methods_comparison_pipeline/150205'
dfi <- data.table(FILE=list.files(indir, '.*SIMULATED_INTERNAL.R$', full.names=FALSE))
dfi[, SC:= sapply(strsplit(FILE, '_'),'[[',3)]
dfi[, CONFIG:= sapply(strsplit(SC, '-'),'[[',2)]
set(dfi, NULL, 'SC', dfi[, sapply(strsplit(SC, '-'),'[[',1)])
#
# true phylo
#
clu <- dfi[, {
#FILE<- dfi[2,FILE]
file <- paste(indir, FILE, sep='/' )
cat(paste('\nprocess',file))
load( file )
#
clu <- subset(df.inds, !is.na(TIME_SEQ))[, list(RECENT_Y=sum(length(which(RECENT_TR=='Y'))), SIZE=length(IDPOP) ), by='IDCLU']
subset(clu, SIZE>1)
}, by=c('SC','CONFIG')]
clu[, SIZEc:= cut(SIZE, breaks=c(0,4,10,1e3), labels=c('<=4','5-10','>10'))]
clup <- clu[, list(RECENT_P=mean(RECENT_Y/SIZE)), by=c('SC','CONFIG','SIZEc')]
ggplot(clup, aes(x=SIZEc, y=RECENT_P, group=SC, colour=SC)) + geom_point() + geom_line() + facet_grid(.~CONFIG) +
theme_bw() + scale_colour_brewer(palette='Paired', name='scenario') +
labs(x='Number of seqs in true dated cluster phylogeny', y='individuals within 6 mo of HIV infection\n(%)')
file <- paste(indir, '150205_TEST_ClustWithin6Mo_truephylo.pdf')
ggsave(file=file, w=8, h=6)
ggplot(subset(clup, grepl('s2x',CONFIG)), aes(x=SIZEc, y=RECENT_P, group=SC, colour=SC)) + geom_point() + geom_line() + facet_grid(.~CONFIG) +
theme_bw() + scale_colour_brewer(palette='Paired', name='scenario') +
labs(x='Number of seqs in true dated cluster phylogeny', y='individuals within 6 mo of HIV infection\n(%)')
#
# NJ tree
#
# load outgroup sequences
file <- system.file(package="PANGEA.HIV.sim", "misc",'PANGEA_SSAfg_HXB2outgroup.R')
cat(paste('\nLoading outgroup seq from file', file))
load(file) #expect "outgroup.seq.gag" "outgroup.seq.pol" "outgroup.seq.env"
clu <- dfi[, {
file <- paste(indir, FILE, sep='/' )
cat(paste('\nprocess',file))
load( file )
# concatenate sequences
tmp <- tolower(do.call('rbind',strsplit(df.seq[, paste(GAG,POL,ENV,sep='')],'')))
rownames(tmp) <- df.seq[, paste(IDCLU,label.sep,LABEL,sep='')]
seq <- as.DNAbin(tmp)
tmp <- cbind(outgroup.seq.gag[,seq_len(df.seq[1, nchar(GAG)])], outgroup.seq.pol, outgroup.seq.env)
seq <- rbind(seq,tmp)
# NJ tree
seq.ph <- nj(dist.dna(seq, model='raw'))
tmp <- which(seq.ph$tip.label=="HXB2")
seq.ph <- reroot(seq.ph, tmp, seq.ph$edge.length[which(seq.ph$edge[,2]==tmp)])
seq.ph <- ladderize(seq.ph)
# clustering at 4%
seq.brdist <- hivc.clu.brdist.stats(seq.ph, eval.dist.btw="leaf", stat.fun=hivc.clu.min.transmission.cascade)
seq.clu <- lapply(c(0.04, 0.06, 0.08), function(x)
{
clustering <- hivc.clu.clusterbythresh(seq.ph, thresh.brl=x, dist.brl=seq.brdist, retval="all")
seq.clu <- subset( data.table( PH_NODE_ID=seq_len(Ntip(seq.ph)), CLU_ID=clustering$clu.mem[ seq_len(Ntip(seq.ph))] ), !is.na(CLU_ID) )
seq.clu[, LABEL:= seq.clu[, seq.ph$tip.label[PH_NODE_ID]] ]
seq.clu[, PH_NODE_ID:=NULL]
seq.clu[, BRL:=x]
seq.clu
})
seq.clu <- do.call('rbind',seq.clu)
seq.clu[, IDPOP:=seq.clu[, as.integer(substring(sapply(strsplit(LABEL,label.sep,fixed=TRUE),'[[',2),7))]]
seq.clu <- merge(seq.clu, subset(df.inds, !is.na(TIME_SEQ), c(IDPOP, RECENT_TR)), by='IDPOP')
seq.clu <- seq.clu[, list(RECENT_Y=sum(length(which(RECENT_TR=='Y'))), SIZE=length(IDPOP) ), by=c('CLU_ID','BRL')]
setnames(seq.clu, 'CLU_ID','IDPHCLU')
seq.clu
}, by=c('SC','CONFIG')]
file <- paste(indir, '150205_TEST_ClustWithin6Mo_clusters.R')
save(file=file, clu)
clu[, SIZEc:= cut(SIZE, breaks=c(0,4,10,1e3), labels=c('<=4','5-10','>10'))]
clup <- clu[, list(RECENT_P=mean(RECENT_Y/SIZE)), by=c('SC','CONFIG','SIZEc','BRL')]
ggplot(clup, aes(x=SIZEc, y=RECENT_P, group=SC, colour=SC)) + geom_point() + geom_line() + facet_grid(BRL~CONFIG) +
theme_bw() + scale_colour_brewer(palette='Paired', name='scenario') +
labs(x='Number of seqs in true dated cluster phylogeny', y='individuals within 6 mo of HIV infection\n(%)')
file <- paste(indir, '150205_TEST_ClustWithin6Mo_clusters.pdf')
ggsave(file=file, w=8, h=10)
}
##--------------------------------------------------------------------------------------------------------
## check reproducibility
## olli 03.02.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.Reproducible<- function()
{
#require(devtools)
#devtools::install("/Users/Oliver/git/HPTN071sim/source/PANGEA.HIV.sim")
#check time to coalescence in true dated tree
label.sep <- '|'
label.idx.date <- 4
label.idx.idpop <- 1
file <- "/Users/Oliver/git/HPTN071sim/tmp150227-E-o111/150129_HPTN071_scE-o111_INTERNAL/150129_HPTN071_scE-o111_SIMULATED_INTERNAL.R"
load(file)
df.epi2 <- copy(df.epi)
df.trms2 <- copy(df.trms)
df.inds2 <- copy(df.inds)
df.sample2 <- copy(df.sample)
df.seq2 <- copy(df.seq)
df.sp2 <- copy(df.sp)
file <- "/Users/Oliver/git/HPTN071sim/tmp150227-E-o111-check/150129_HPTN071_scE-o111_INTERNAL/150129_HPTN071_scE-o111_SIMULATED_INTERNAL.R"
load(file)
all(df.epi[, PREV]==df.epi2[, PREV])
all(df.epi[, IMPORT]==df.epi2[, IMPORT])
all(df.sample[, s.nTOTAL]==df.sample2[, s.nTOTAL])
all(subset(df.inds, !is.na(TIME_SEQ))[, TIME_SEQ]==subset(df.inds2, !is.na(TIME_SEQ))[, TIME_SEQ])
all(subset(df.inds, !is.na(TIME_SEQ))[, IDCLU]==subset(df.inds2, !is.na(TIME_SEQ))[, IDCLU])
all(df.sp[, ALIVE_N]==df.sp2[, ALIVE_N])
all(df.sp[, ALIVE_AND_SEQ_N]==df.sp2[, ALIVE_AND_SEQ_N])
all(df.seq[, IDPOP]==df.seq2[, IDPOP])
all(df.seq[, GAG]==df.seq2[, GAG])
all(df.seq[, POL]==df.seq2[, POL])
all(df.seq[, ENV]==df.seq2[, ENV])
#
#
#
file1 <- '/Users/Oliver/git/HPTN071sim/tmp150227-E-o111 copy/150129_HPTN071_scE-o111_INTERNAL/150129_HPTN071_scE-o111_SIMULATED_INTERNAL.R'
file2 <- '/Users/Oliver/git/HPTN071sim/tmp150227-E-o111/150129_HPTN071_scE-o111_INTERNAL/150129_HPTN071_scE-o111_SIMULATED_INTERNAL.R'
load(file2)
df.epi2 <- copy(df.epi)
df.trms2 <- copy(df.trms)
df.inds2 <- copy(df.inds)
df.sample2 <- copy(df.sample)
df.seq2 <- copy(df.seq)
df.sp2 <- copy(df.sp)
load(file1)
all(df.seq[, IDPOP]==df.seq2[, IDPOP])
all(df.seq[, GAG]==df.seq2[, GAG])
all(df.seq[, POL]==df.seq2[, POL])
all(df.seq[, ENV]==df.seq2[, ENV])
ph1 <- read.tree('/Users/Oliver/git/HPTN071sim/tmp150227-E-o111-check/150129_HPTN071_scE-o111_SIMULATED_TREE/150129_HPTN071_scE-o111_DATEDTREE.newick')
ph2 <- read.tree('/Users/Oliver/git/HPTN071sim/tmp150227-E-o111/150129_HPTN071_scE-o111_SIMULATED_TREE/150129_HPTN071_scE-o111_DATEDTREE.newick')
setequal(ph1$tip.label,ph2$tip.label)
tmp <- merge( data.table(LABEL=ph1$tip.label, H1=node.depth.edgelength(ph1)[seq_len(Ntip(ph1))]),
data.table(LABEL=ph2$tip.label, H2=node.depth.edgelength(ph2)[seq_len(Ntip(ph2))]), by='LABEL')
subset(tmp, round(H1,d=3)!=round(H2,d=3))
ph1 <- read.tree('/Users/Oliver/git/HPTN071sim/tmp150227-E-o111/150129_HPTN071_scE-o111_SIMULATED_TREE/150129_HPTN071_scE-o111_SUBSTTREE.newick')
ph2 <- read.tree('/Users/Oliver/git/HPTN071sim/tmp150227-E-o111 copy/150129_HPTN071_scE-o111_SIMULATED_TREE/150129_HPTN071_scE-o111_SUBSTTREE.newick')
setequal(ph1$tip.label,ph2$tip.label)
tmp <- merge( data.table(LABEL=ph1$tip.label, H1=node.depth.edgelength(ph1)[seq_len(Ntip(ph1))]),
data.table(LABEL=ph2$tip.label, H2=node.depth.edgelength(ph2)[seq_len(Ntip(ph2))]), by='LABEL')
subset(tmp, round(H1,d=3)!=round(H2,d=3))
load('/Users/Oliver/git/HPTN071sim/tmp150227-E-o111 copy/SeqGen/150129_HPTN071_scE-o111_seqgen.R')
df.seqgen2 <- copy(df.seqgen)
load('/Users/Oliver/git/HPTN071sim/tmp150227-E-o111/SeqGen/150129_HPTN071_scE-o111_seqgen.R')
setnames(df.seqgen2, 'NEWICK', 'NEWICK2')
tmp <- merge(subset(df.seqgen, GENE=='GAG' & CODON_POS=='CP1', select=c(IDCLU, NEWICK)), subset(df.seqgen2, GENE=='GAG' & CODON_POS=='CP1', select=c(IDCLU, NEWICK2)), by='IDCLU')
for(clu in tmp[,IDCLU])
{
clu<-1
ph1 <- subset(tmp, IDCLU==clu)[, read.tree(text=NEWICK)]
ph2 <- subset(tmp, IDCLU==clu)[, read.tree(text=NEWICK2)]
stopifnot( setequal(ph1$tip.label,ph2$tip.label) )
xx <- merge( data.table(LABEL=ph1$tip.label, H1=node.depth.edgelength(ph1)[seq_len(Ntip(ph1))]),
data.table(LABEL=ph2$tip.label, H2=node.depth.edgelength(ph2)[seq_len(Ntip(ph2))]), by='LABEL')
subset(xx, round(H1,d=4)!=round(H2,d=4))
}
x1 <- data.table(FILE=list.files('/Users/Oliver/git/HPTN071sim/tmp150227-E-o111 copy 2/VirusTreeSimulator', full.names=1))
x2 <- data.table(FILE=list.files('/Users/Oliver/git/HPTN071sim/tmp150227-E-o111 copy/VirusTreeSimulator', full.names=1))
x1[, B:= basename(FILE)]
x2[, B:= basename(FILE)]
x1 <- merge(x1,x2,by='B')
for(i in seq_len(nrow(x1)))
{
print(i)
ph1 <- hivc.beast2out.read.nexus.and.stats(x1[i, FILE.x], method.node.stat='any.node')$tree
ph2 <- hivc.beast2out.read.nexus.and.stats(x1[i, FILE.y], method.node.stat='any.node')$tree
ph1 <- seq.collapse.singles(ph1)
ph2 <- seq.collapse.singles(ph2)
stopifnot( setequal(ph1$tip.label,ph2$tip.label) )
if(Nnode(ph1))
{
xx <- merge( data.table(LABEL=ph1$tip.label, H1=node.depth.edgelength(ph1)[seq_len(Ntip(ph1))]),
data.table(LABEL=ph2$tip.label, H2=node.depth.edgelength(ph2)[seq_len(Ntip(ph2))]), by='LABEL')
stopifnot( nrow(subset(xx, round(H1,d=4)!=round(H2,d=4)))==0 )
}
if(!Nnode(ph1))
stopifnot( ph1$root.edge==ph2$root.edge )
}
}
##--------------------------------------------------------------------------------------------------------
## check simulated sequences: create ExaML tree and estimate R2
## olli 03.02.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.PropAcute.Village<- function()
{
# add dated tree files
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201502/Village'
dfi <- data.table(FILE=list.files(indir, '.*5yr.nex$', full.names=TRUE, recursive=TRUE))
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201502/Village_unblinded/Post-Dynamics-Info/Vill_1-4_TrueTrees'
dfi <- rbind(dfi, data.table(FILE=list.files(indir, 'nex$', full.names=TRUE, recursive=TRUE)))
dfi[, SC:= substr(gsub('Vill_','',regmatches(FILE,regexpr('Vill_[0-9]+_', FILE))),1,2)]
# add basic epi information
tmp <- data.table( SC= c('00','01','02','03','04','05','06','07','08','09','10','11'),
SCL= c('Intervention=none, Acute=low',
'Intervention=fast, Acute=high',
'Intervention=slow, Acute=low',
'Intervention=fast, Acute=low',
'Intervention=slow, Acute=high',
'Intervention=fast, Acute=low',
'Intervention=fast, Acute=high',
'Intervention=slow, Acute=high',
'Intervention=slow, Acute=low',
'Intervention=slow, Acute=low',
'Intervention=slow, Acute=high',
'Intervention=fast, Acute=low'))
dfi <- merge(dfi, tmp, by='SC')
dfi[, INT:= regmatches(SCL,regexpr('Intervention=[a-z]+',SCL))]
set(dfi, NULL, 'INT', dfi[, factor(INT, labels=c('Intervention Scale Up\nfast','Intervention Scale Up\nslow','Intervention Scale Up\nnone'), levels=c('Intervention=fast','Intervention=slow','Intervention=none'))])
dfi[, AC:= regmatches(SCL,regexpr('Acute=[a-z]+',SCL))]
# add transmission line list files
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201502/Village_unblinded/Transmission-Lists'
tmp <- data.table(FILE_LIST=list.files(indir, '.csv$', full.names=TRUE, recursive=TRUE))
tmp[, SC:= gsub('Vill_','',regmatches(FILE_LIST,regexpr('Vill_[0-9]+', FILE_LIST)))]
dfi <- merge(dfi, tmp, by='SC')
#
# get generation time **TO** new infected
#
trms <- do.call('rbind',lapply(seq_len(nrow(dfi)), function(i)
{
tmp <- dfi[i, FILE_LIST]
df.trms <- subset(as.data.table(read.csv(tmp)), EventType=='INFECTION')
df.trms[, EventType:=NULL]
setnames(df.trms, c('ActionTime','FromDeme.FromHost','ToDeme.ToHost'), c('TIME_TR','IDTR','IDREC'))
tmp <- unique(subset(df.trms, select=c(TIME_TR, IDREC)))
setnames(tmp, c('TIME_TR','IDREC'), c('TR_INF','IDTR'))
df.trms <- merge(df.trms, tmp, by='IDTR', all.x=1)
df.trms <- subset(df.trms, !is.na(IDTR)) #ignore index case
df.trms[, T2REC:= TIME_TR-TR_INF]
ans <- subset(df.trms, select=c('IDREC','IDTR','T2REC','TIME_TR'))
ans[, SC:= dfi[i, SC]]
ans[, SCL:= dfi[i, SCL]]
ans[, INT:= dfi[i, INT]]
ans[, AC:= dfi[i, AC]]
ans
}))
#
# get genetic distance between sampled individuals
#
gds <- do.call('rbind',lapply(seq_len(nrow(dfi)), function(i)
{
#FILE <- '/Users/Oliver/Dropbox (SPH Imperial College)/PANGEAHIVsim/201502/Village_unblinded/Post-Dynamics-Info/Vill_1-4_TrueTrees/Vill_03_Feb2015.nex'
tmp <- dfi[i, FILE]
ph <- read.nexus(tmp)
tmp <- cophenetic.phylo(ph)
diag(tmp) <- Inf
gds <- data.table(LABEL=rownames(tmp), LABEL_CL=colnames(tmp)[apply(tmp, 1, which.min)])
gds <- merge(gds, gds[, list(BRL=tmp[LABEL, LABEL_CL]), by='LABEL'], by='LABEL')
gds[, IDREC:= sapply(strsplit(LABEL,'_',fixed=T),'[[',1),]
gds[, TIME_SEQ:=as.numeric(sapply(strsplit(LABEL,'_',fixed=T),'[[',3))]
gds[, SC:= dfi[i, SC]]
gds[, SCL:= dfi[i, SCL]]
gds[, INT:= dfi[i, INT]]
gds[, AC:= dfi[i, AC]]
gds
}))
#
set(gds, NULL, c('LABEL','LABEL_CL'), NULL)
# get info on who sampled
trms <- merge(trms, gds, by=c('SC','SCL','INT','AC','IDREC'), all=1)
setnames(trms, 'TIME_SEQ', 'SAMPLED_REC')
tmp <- subset(trms, select=c(SC, IDREC, SAMPLED_REC) )
setnames(tmp, c('IDREC','SAMPLED_REC'), c('IDTR','SAMPLED_TR'))
trms <- merge(trms, tmp, by=c('SC','IDTR'), all.x=1)
#
# plot CDF among all sequenced infected by data set
#
trms[, PERIOD_TR:= trms[, cut(TIME_TR, breaks=c(0,20,40,Inf), labels=c('<=20','20-40','>=40'))]]
ggplot(subset(trms, SCL!="Intervention=none, Acute=low" & TIME_TR>20), aes(x=T2REC, colour=SCL, alpha=SC)) + stat_ecdf() +
facet_grid(PERIOD_TR~INT) +
labs(x='\ntime to transmission\nto all sampled individuals\n(years)',y='empirical CDF\n') +
#scale_color_brewer(name='simulation data sets', palette='Paired') +
scale_color_manual(values=c("#A6CEE3","#1F78B4","#FB9A99","#E31A1C")) +
scale_x_continuous(expand=c(0,0)) +
coord_cartesian(xlim=c(0,20)) +
scale_y_continuous(expand=c(0,0)) +
scale_alpha_manual(values=rep(1,trms[, length(unique(SC))]), guide = FALSE) +
theme_bw() + theme(legend.position='bottom')
outdir <- '~/Dropbox (SPH Imperial College)/2015_PANGEA_MCE_manuscript/figures'
file <- file.path(outdir, '150715_V_Time2TransmissionByACSC.pdf')
ggsave(file=file, w=7, h=7)
ggplot(subset(trms, !is.na(SAMPLED_REC) & SCL!="Intervention=none, Acute=low" & TIME_TR>20), aes(x=T2REC, colour=SCL, alpha=SC)) + stat_ecdf() +
facet_grid(PERIOD_TR~INT) +
labs(x='\ntime to transmission\nto all sampled individuals\n(years)',y='empirical CDF\n') +
#scale_color_brewer(name='simulation data sets', palette='Paired') +
scale_color_manual(values=c("#A6CEE3","#1F78B4","#FB9A99","#E31A1C")) +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
coord_cartesian(xlim=c(0,20)) +
scale_alpha_manual(values=rep(1,trms[, length(unique(SC))]), guide = FALSE) +
theme_bw() + theme(legend.position='bottom')
outdir <- '~/Dropbox (SPH Imperial College)/2015_PANGEA_MCE_manuscript/figures'
file <- file.path(outdir, '150715_V_Time2TransmissionSequencedByACSC.pdf')
ggsave(file=file, w=7, h=7)
#
# plot CDF among all infected by data set
#
ggplot(subset(trms, !is.na(SAMPLED_REC) & TIME_TR>20 & SCL!="Intervention=none, Acute=low" ), aes(x=BRL*0.0022, colour=SCL, alpha=SC)) + stat_ecdf() +
facet_grid(PERIOD_TR~INT) +
#labs(x='\ntime spent diverging\nbetween phylogenetically closest individuals',y='empirical CDF') +
labs(x='\ngenetic distance\nbetween phylogenetically closest individuals\n(subst/site/year)',y='empirical CDF\n') +
#scale_color_brewer(name='data sets', palette='Paired') +
scale_color_manual(values=c("#A6CEE3","#1F78B4","#FB9A99","#E31A1C")) +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
scale_alpha_manual(values=rep(1,trms[, length(unique(SC))]), guide = FALSE) +
theme_bw() + theme(legend.position='bottom')
file <- file.path(outdir, '150715_V_GeneticDistanceSequencedRecByACSC.pdf')
ggsave(file=file, w=7, h=7)
}
##--------------------------------------------------------------------------------------------------------
## check simulated sequences: create ExaML tree and estimate R2
## olli 03.02.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.PropAcute.Regional<- function()
{
#check time to coalescence in true dated tree
label.sep <- '|'
label.idx.date <- 4
label.idx.idpop <- 1
# get blinding codes
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201502/Regional_unblinded'
file <- paste(indir, '/answers_Regional_Feb2015_rFormat.csv', sep='')
dfi.code <- read.csv(file.path(indir, 'answers_Regional_Feb2015_rFormat.csv'))
dfi.code <- subset(dfi.code, select=c('SC','CONFIG','SC_RND','GSUB_FROM'))
setnames(dfi.code, c('SC','SC_RND'), c('SC_TRL','SC'))
# add dated tree files
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201502/Regional'
dfi <- data.table(FILE=list.files(indir, '.*DATEDTREE.newick$', full.names=FALSE, recursive=TRUE))
dfi[, SC:= regmatches(FILE,regexpr('sc[A-Z]', FILE))]
set(dfi, NULL, 'SC', dfi[, sapply(strsplit(SC, '-'),'[[',1)])
set(dfi, NULL, 'SC', dfi[, substring(SC,3)])
dfi <- merge(dfi, dfi.code, by='SC')
tmp <- data.table(SC_TRL=c('scA','scB','scC','scD','scE','scF'),SCL=c('Intervention=fast, Acute=low','Intervention=fast, Acute=high','Intervention=slow, Acute=low','Intervention=slow, Acute=high','Intervention=none, Acute=low','Intervention=none, Acute=high'))
dfi <- merge(dfi, tmp, by='SC_TRL')
# add INTERNAL files
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201502/Regional_unblinded'
dfi.internal <- data.table(FILE_INTRNL=list.files(indir, '.*SIMULATED_INTERNAL.R$', full.names=TRUE, recursive=TRUE))
dfi.internal[, GSUB_FROM:= regmatches(FILE_INTRNL, regexpr('sc[A-Z]-[a-zA-Z0-9]+',FILE_INTRNL))]
dfi <- merge(dfi, dfi.internal, by='GSUB_FROM')
# add metadata files
indir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201502/Regional'
dfi.meta <- data.table(FILE_META=list.files(indir, '.*metadata.csv$', full.names=TRUE, recursive=TRUE))
dfi.meta[, SC:= gsub('sc','',regmatches(FILE_META,regexpr('sc[A-Z]', FILE_META)))]
dfi <- merge(dfi, dfi.meta, by='SC')
dfi[, INT:= regmatches(SCL,regexpr('Intervention=[a-z]+',SCL))]
set(dfi, NULL, 'INT', dfi[, factor(INT, labels=c('Intervention\nScale Up\nfast','Intervention\nScale Up\nslow','Intervention\nScale Up\nnone'), levels=c('Intervention=fast','Intervention=slow','Intervention=none'))])
dfi[, AC:= regmatches(SCL,regexpr('Acute=[a-z]+',SCL))]
#
# get generation time **TO** new infected
#
trms <- do.call('rbind',lapply(seq_len(nrow(dfi)), function(i)
{
tmp <- dfi[i, FILE_INTRNL]
load(tmp)
df.trms <- subset(df.trms, IDTR>0)
df.trms[, T2REC:= TIME_TR-IDTR_TIME_INFECTED]
ans <- subset(df.trms, select=c('IDREC','T2REC','TIME_TR','TR_ACUTE','SAMPLED_REC','SAMPLED_TR'))
ans[, SC:= dfi[i, SC]]
ans[, SCL:= dfi[i, SCL]]
ans[, CONFIG:= dfi[i, CONFIG]]
ans[, INT:= dfi[i, INT]]
ans[, AC:= dfi[i, AC]]
ans
}))
trms[, PERIOD_TR:= trms[, cut(TIME_TR, breaks=c(1980,2004,2014,Inf), labels=c('<=2004','2005-2014','>=2015'))]]
#
# get genetic distance between sampled individuals
#
gds <- do.call('rbind',lapply(seq_len(nrow(dfi)), function(i)
{
tmp <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201502/Regional'
#file <- file.path(tmp,'150129_PANGEAsim_Regional_SecondObj_scH_SIMULATED_DATEDTREE/150129_PANGEAsim_Regional_SecondObj_scH_DATEDTREE.newick')
file <- paste(tmp, dfi[i, FILE], sep='/' )
cat(paste('\nprocess',file))
phd <- read.tree(file)
singletons.n <- length(which(sapply(phd, Ntip)==1))
phd <- phd[ which(sapply(phd, Ntip)>1) ]
# sampling times of tips and node.depth of tips for every tree
tmp <- lapply(seq_along(phd), function(k)
{
ph <- phd[[k]]
tmp <- cophenetic.phylo(ph)
diag(tmp) <- Inf
ans <- data.table(LABEL=rownames(tmp), LABEL_CL=colnames(tmp)[apply(tmp, 1, which.min)])
ans <- merge(ans, ans[, list(BRL_TIME=tmp[LABEL, LABEL_CL]), by='LABEL'], by='LABEL')
ans[, CLU_IDX:=k]
ans[, IDPOP:= as.integer(substring(sapply(strsplit(LABEL,label.sep,fixed=T),'[[',label.idx.idpop),7)),]
ans[, TIME_SEQ:=as.numeric(sapply(strsplit(LABEL,label.sep,fixed=T),'[[',label.idx.date))]
ans
})
gds <- do.call('rbind',tmp)
# calculate raw genetic distances between taxon and phylogenetically closest taxon
tmp <- dfi[i, FILE_INTRNL]
load(tmp)
# generate simulated sequences
tmp <- tolower(do.call('rbind',strsplit(df.seq[, paste(GAG,POL,ENV,sep='')],'')))
rownames(tmp) <- df.seq[, LABEL]
seq <- as.DNAbin(tmp)
# calculate pairwise distances
tmp <- gds[, list(GD=as.numeric(dist.dna(seq[c(LABEL, LABEL_CL),], model='raw'))), by=c('LABEL','LABEL_CL')]
gds <- merge(gds, tmp, by=c('LABEL','LABEL_CL'))
gds[, SC:= dfi[i, SC]]
gds[, SCL:= dfi[i, SCL]]
gds[, CONFIG:= dfi[i, CONFIG]]
gds[, INT:= dfi[i, INT]]
gds[, AC:= dfi[i, AC]]
gds
}))
setnames(gds, 'IDPOP', 'IDREC')
tmp <- subset(trms, !is.na(SAMPLED_REC), select=c(SC, IDREC, TR_ACUTE, TIME_TR, SAMPLED_REC, SAMPLED_TR))
gds <- merge(gds, tmp, by=c('SC','IDREC')) # loose some that have a transmitter from outside
gds[, PERIOD_TR:= gds[, cut(TIME_TR, breaks=c(1980,2004,2014,Inf), labels=c('<=2004','2005-2014','>=2015'))]]
gds[, PERIOD_SEQ:= gds[, cut(TIME_SEQ, breaks=c(1980,2004,2014,Inf), labels=c('<=2004','2005-2014','>=2015'))]]
#
# get coalescent times of sampled individuals
#
coal <- do.call('rbind',lapply(seq_len(nrow(dfi)), function(i)
{
tmp <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim/201502/Regional'
#FILE <- '150129_PANGEAsim_Regional_SecondObj_scH_SIMULATED_DATEDTREE/150129_PANGEAsim_Regional_SecondObj_scH_DATEDTREE.newick'
file <- paste(tmp, dfi[i, FILE], sep='/' )
cat(paste('\nprocess',file))
phd <- read.tree(file)
singletons.n <- length(which(sapply(phd, Ntip)==1))
phd <- phd[ which(sapply(phd, Ntip)>1) ]
# sampling times of tips and node.depth of tips for every tree
tmp <- lapply(seq_along(phd), function(k)
{
ph <- phd[[k]]
ans <- data.table( LABEL=ph$tip.label, CLU_IDX=k,
IDPOP= as.integer(substring(sapply(strsplit(ph$tip.label,label.sep,fixed=T),'[[',label.idx.idpop),7)),
TIME_SEQ=as.numeric(sapply(strsplit(ph$tip.label,label.sep,fixed=T),'[[',label.idx.date)))
ans[, BRL:=ph$edge.length[ sort(ph$edge[,2],index.return=T)$ix ][ seq_len(Ntip(ph)) ] ]
})
coal <- do.call('rbind',tmp)
coal[, SC:= dfi[i, SC]]
coal[, SCL:= dfi[i, SCL]]
coal[, CONFIG:= dfi[i, CONFIG]]
coal[, INT:= dfi[i, INT]]
coal[, AC:= dfi[i, AC]]
coal
}))
setnames(coal, 'IDPOP', 'IDREC')
tmp <- subset(trms, !is.na(SAMPLED_REC), select=c(SC, IDREC, TR_ACUTE, TIME_TR, SAMPLED_REC, SAMPLED_TR))
coal <- merge(coal, tmp, by=c('SC','IDREC')) # loose some that have a transmitter from outside
coal[, PERIOD_TR:= coal[, cut(TIME_TR, breaks=c(1980,2004,2014,Inf), labels=c('<=2004','2005-2014','>=2015'))]]
coal[, PERIOD_SEQ:= coal[, cut(TIME_SEQ, breaks=c(1980,2004,2014,Inf), labels=c('<=2004','2005-2014','>=2015'))]]
#
# plot CDF among all infected by acute / non-acute infection
#
ggplot(subset(trms, TIME_TR>2005), aes(x=T2REC, colour=TR_ACUTE)) + stat_ecdf() +
facet_grid(INT+AC+SC~PERIOD_TR) +
labs(x='\ntime to transmission\n(years)',y='empirical CDF\n') +
scale_color_brewer(name='transmission from acute', palette='Paired') +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
coord_cartesian(xlim=c(0,20)) +
theme_bw() + theme(legend.position='bottom')
outdir <- '~/Dropbox (SPH Imperial College)/2015_PANGEA_MCE_manuscript/figures'
file <- file.path(outdir, '150715_R_Time2TransmissionByAcute.pdf')
ggsave(file=file, w=5, h=12)
# plot CDF among all infected by data set
ggplot(subset(trms, TIME_TR>2005), aes(x=T2REC, colour=SCL, alpha=CONFIG)) + stat_ecdf() +
facet_grid(PERIOD_TR~INT) +
labs(x='\ntime to transmission\nto all infected individuals\n(years)',y='empirical CDF\n') +
scale_color_brewer(name='simulation data sets', palette='Paired') +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
coord_cartesian(xlim=c(0,20)) +
scale_alpha_manual(values=rep(1,trms[, length(unique(CONFIG))]), guide = FALSE) +
theme_bw() + theme(legend.position='bottom')
file <- file.path(outdir, '150715_R_Time2TransmissionByACSC.pdf')
ggsave(file=file, w=10, h=7)
# plot CDF among all sequenced infected by data set
ggplot(subset(trms, !is.na(SAMPLED_REC) & TIME_TR>2005), aes(x=T2REC, colour=SCL, alpha=CONFIG)) + stat_ecdf() +
facet_grid(PERIOD_TR~INT) +
labs(x='\ntime to transmission\nto all sampled individuals\n(years)',y='empirical CDF\n') +
scale_color_brewer(name='simulation data sets', palette='Paired') +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
coord_cartesian(xlim=c(0,20)) +
scale_alpha_manual(values=rep(1,trms[, length(unique(CONFIG))]), guide = FALSE) +
theme_bw() + theme(legend.position='bottom')
file <- file.path(outdir, '150715_R_Time2TransmissionSequencedByACSC.pdf')
ggsave(file=file, w=10, h=7)
# plot CDF among all sequenced infected by data set
ggplot(subset(trms, !is.na(SAMPLED_REC) & TIME_TR>2005), aes(x=T2REC, colour=SCL, alpha=CONFIG)) + stat_ecdf() +
facet_grid(.~INT) +
labs(x='\ntime to transmission\nto all sampled individuals\n(years)',y='empirical CDF\n') +
scale_color_brewer(name='simulation data sets', palette='Paired') +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
coord_cartesian(xlim=c(0,20)) +
scale_alpha_manual(values=rep(1,trms[, length(unique(CONFIG))]), guide = FALSE) +
theme_bw() + theme(legend.position='bottom')
file <- file.path(outdir, '150715_R_Time2TransmissionSequencedByACSC_noPERIOD.pdf')
ggsave(file=file, w=10, h=5)
#
# gds
#
#ggplot(subset(gds, TIME_TR>2005), aes(x=BRL*0.0022, colour=SCL, alpha=CONFIG)) + stat_ecdf() +
ggplot(subset(gds, TIME_TR>2005), aes(x=GD, colour=SCL, alpha=CONFIG)) + stat_ecdf() +
facet_grid(PERIOD_TR~INT) +
#labs(x='\ntime spent diverging\nbetween phylogenetically closest individuals',y='empirical CDF') +
labs(x='\ngenetic distance\nbetween phylogenetically closest individuals\n(subst/site/year)',y='empirical CDF\n') +
scale_color_brewer(name='data sets', palette='Paired') +
scale_x_continuous(expand=c(0,0)) +
coord_cartesian(xlim=c(0,0.15)) +
scale_y_continuous(expand=c(0,0)) +
scale_alpha_manual(values=rep(1,trms[, length(unique(CONFIG))]), guide = FALSE) +
theme_bw() + theme(legend.position='bottom')
file <- file.path(outdir, '150715_R_GeneticDistanceSequencedRecByACSC2.pdf')
ggsave(file=file, w=10, h=7)
#
ggplot(subset(gds, TIME_TR>2005), aes(x=GD, colour=SCL, alpha=CONFIG)) + stat_ecdf() +
facet_grid(.~INT) +
#labs(x='\ntime spent diverging\nbetween phylogenetically closest individuals',y='empirical CDF') +
labs(x='\ngenetic distance between phylogenetically closest individuals (subst/site)',y='empirical CDF\n') +
scale_color_brewer(name='data sets', palette='Paired') +
scale_x_continuous(expand=c(0,0)) +
coord_cartesian(xlim=c(0,0.15)) +
scale_y_continuous(expand=c(0,0)) +
scale_alpha_manual(values=rep(1,trms[, length(unique(CONFIG))]), guide = FALSE) +
theme_bw() + theme(legend.position='bottom')
file <- file.path(outdir, '150715_R_GeneticDistanceSequencedRecByACSC2_noPERIOD.pdf')
ggsave(file=file, w=10, h=5)
#
# coal
#
ggplot(subset(coal, TIME_TR>2005), aes(x=BRL, colour=SCL, alpha=CONFIG)) + stat_ecdf() +
facet_grid(PERIOD_TR~INT) +
#labs(x='\ntime spent diverging\nbetween phylogenetically closest individuals',y='empirical CDF') +
labs(x='\ntime to coalescence\nof all individuals\n(years)',y='empirical CDF\n') +
scale_color_brewer(name='data sets', palette='Paired') +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
coord_cartesian(xlim=c(0,35)) +
scale_alpha_manual(values=rep(1,trms[, length(unique(CONFIG))]), guide = FALSE) +
theme_bw() + theme(legend.position='bottom')
file <- file.path(outdir, '150715_R_Time2CoalescenceSequencedRecByACSC.pdf')
ggsave(file=file, w=10, h=7)
#
# proportion recent infections
#
meta <- dfi[, {
#FILE_META<- '/Users/Oliver/Dropbox (SPH Imperial College)/PANGEAHIVsim/201502/Regional/150129_PANGEAsim_Regional_SecondObj_scT_SIMULATED_DATEDTREE/150129_PANGEAsim_Regional_SecondObj_scT_SIMULATED_metadata.csv'
subset(as.data.table(read.csv(FILE_META)), select=c(IDPOP, RECENT_TR))
}, by=c('SC','SCL', 'CONFIG', 'FILE_META')]
setnames(meta, 'IDPOP', 'IDREC')
tmp <- subset(trms, select=c(SC, IDREC, TR_ACUTE, TIME_TR, SAMPLED_REC, SAMPLED_TR))
meta <- merge(meta, tmp, by=c('SC','IDREC')) # loose some that have a transmitter from outside
meta[, INT:= regmatches(SCL,regexpr('Intervention=[a-z]+',SCL))]
set(meta, NULL, 'INT', meta[, factor(INT, labels=c('Intervention\nScale Up\nfast','Intervention\nScale Up\nslow','Intervention\nScale Up\nnone'), levels=c('Intervention=fast','Intervention=slow','Intervention=none'))])
meta[, AC:= regmatches(SCL,regexpr('Acute=[a-z]+',SCL))]
meta[, PERIOD_TR:= meta[, cut(TIME_TR, breaks=c(1980,2004,2014,Inf), labels=c('<=2004','2005-2014','>=2015'))]]
meta <- subset(meta, !is.na(RECENT_TR))[, list(RECENT_TR=mean(RECENT_TR=='Y')), by=c('SC','SCL', 'CONFIG', 'INT','AC','PERIOD_TR')]
ggplot(subset(meta, PERIOD_TR!='<=2004'), aes(x=AC, y=RECENT_TR, colour=SCL, alpha=CONFIG)) + geom_point() +
facet_grid(PERIOD_TR~INT) +
labs(x='\nsampled individuals in recent infection at diagnosis\nof those with information on recent infection',y='') +
scale_color_brewer(name='simulation data sets', palette='Paired') +
#scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(labels = scales::percent) +
scale_alpha_manual(values=rep(1,meta[, length(unique(CONFIG))]), guide = FALSE) +
theme_bw() + theme(legend.position='bottom')
file <- file.path(outdir, '150715_TEST_PropRecentAtDiagByACSC.pdf')
ggsave(file=file, w=10, h=5)
}
##--------------------------------------------------------------------------------------------------------
## check simulated sequences: create ExaML tree and estimate R2
## olli 02.02.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.SSApg.CLUSTERBEAST.extendedskyline<- function()
{
require(phytools)
require(hivclust)
require(XML)
# load outgroup sequences
file <- system.file(package="PANGEA.HIV.sim", "misc",'PANGEA_SSAfg_HXB2outgroup.R')
cat(paste('\nLoading outgroup seq from file', file))
load(file) #expect "outgroup.seq.gag" "outgroup.seq.pol" "outgroup.seq.env"
tree.id.labelsep <- '|'
tree.id.label.idx.ctime <- 4
select <- 'mseq500'
#select <- 'mseq100'
indir <- '/Users/Oliver/duke/2014_Gates/methods_comparison_pipeline/150203'
#indir <- '/Users/Oliver/git/HPTN071sim/tmp140914/140716_RUN001_INTERNAL'
outdir <- indir
infiles <- list.files(indir, '.*INTERNAL.R$', full.names=FALSE)
#stopifnot(length(infiles)==1)
# read BEAST template files
infile.beast.pol <- system.file(package="PANGEA.HIV.sim", "misc",'BEAST_template_vTESTpol.xml')
bxml.template.polut <- xmlTreeParse(infile.beast.pol, useInternalNodes=TRUE, addFinalizer = TRUE)
infile.beast.pol <- system.file(package="PANGEA.HIV.sim", "misc",'BEAST_template_vTESTpol_fixedtree.xml')
#infile.beast.pol <- '~/git/HPTN071sim/source/PANGEA.HIV.sim/inst/misc/BEAST_template_vTESTpol_fixedtree.xml'
bxml.template.polft <- xmlTreeParse(infile.beast.pol, useInternalNodes=TRUE, addFinalizer = TRUE)
infile.beast.gag <- system.file(package="PANGEA.HIV.sim", "misc",'BEAST_template_vTESTgag.xml')
bxml.template.gag <- xmlTreeParse(infile.beast.gag, useInternalNodes=TRUE, addFinalizer = TRUE)
infile.beast.env <- system.file(package="PANGEA.HIV.sim", "misc",'BEAST_template_vTESTenv.xml')
bxml.template.env <- xmlTreeParse(infile.beast.env, useInternalNodes=TRUE, addFinalizer = TRUE)
#
# run
#
for(i in seq_along(infiles))
{
infile <- infiles[i]
# load simulated data
file <- paste(indir, '/', infile, sep='')
cat(paste('\nLoading file', file))
load(file) #expect "df.epi" "df.trms" "df.inds" "df.sample" "df.seq"
set( df.seq, NULL, 'IDCLU', df.seq[, as.integer(IDCLU)] )
#
if(grepl('nseq',select))
{
thresh.NSEQ <- as.numeric(substring(select, 5))
thresh.brl <- c(seq(0.001, 0.05, 0.001), seq(0.06, 0.5, 0.1))
thresh.nseq <- sapply(thresh.brl, function(x)
{
clustering <- hivc.clu.clusterbythresh(seq.ph, thresh.brl=x, dist.brl=tmp, retval="all")
which(clustering$size.tips>10)
length(which(!is.na(clustering$clu.mem[ seq_len(Ntip(seq.ph))] )))
})
thresh.brl <- thresh.brl[ which(thresh.nseq>=thresh.NSEQ)[1] ]
clustering <- hivc.clu.clusterbythresh(seq.ph, thresh.brl=thresh.brl, dist.brl=tmp, retval="all")
cat(paste('\nFound clusters, n=', length(clustering$clu.idx)))
seq.select <- subset( data.table( PH_NODE_ID=seq_len(Ntip(seq.ph)), CLU_ID=clustering$clu.mem[ seq_len(Ntip(seq.ph))] ), !is.na(CLU_ID) )
seq.select[, LABEL:= seq.select[, seq.ph$tip.label[PH_NODE_ID]] ]
seq.select <- merge(df.seq, seq.select, by='LABEL')
}
if(grepl('mseq',select))
{
thresh.NSEQ <- as.numeric(substring(select, 5))
seq.select <- subset(df.inds, !is.na(IDCLU))
seq.select <- subset(merge(seq.select, seq.select[, list(CLU_N=-length(which(!is.na(TIME_SEQ)))), by='IDCLU'], by='IDCLU'), CLU_N<0 & !is.na(TIME_SEQ))
setkey(seq.select, CLU_N, IDCLU)
tmp <- unique(seq.select)
tmp[, CLU_CN:= tmp[,cumsum(-CLU_N)]]
tmp <- tmp[seq_len( tmp[, which(CLU_CN>=thresh.NSEQ)[1]] ), ]
seq.select <- merge( seq.select, subset(tmp, select=IDCLU), by='IDCLU' )
seq.select <- merge(df.seq, seq.select, by=c('IDCLU','IDPOP'))
cat(paste('\nFound clusters, n=', seq.select[, length(unique(IDCLU))]))
cat(paste('\nFound sequences, n=', seq.select[, length(unique(IDPOP))]))
}
#
# read NEWICK trees for each cluster phylogeny, if there
#
tmp <- list.files(indir, '_DATEDTREE.newick$', full.names=FALSE)
tmp <- tmp[ grepl(substr(infile, 1, regexpr('_SIMULATED',infile)), tmp) ]
if(length(tmp))
{
# select
phd <- read.tree(paste(indir, tmp, sep='/'))
tmp2 <- data.table(IDPOP= sapply(phd, function(x) x$tip.label[1]), IDX=seq_along(phd))
set( tmp2, NULL, 'IDPOP', tmp2[, as.integer(substring(sapply(strsplit(IDPOP, tree.id.labelsep, fixed=TRUE),'[[',1),7)) ] )
tmp2 <- merge(subset(seq.select, select=c(IDPOP, IDCLU)), tmp2, by='IDPOP')
phd <- lapply(tmp2[,IDX], function(i) phd[[i]] )
names(phd) <- tmp2[, IDCLU]
# plot
phd.plot <- eval(parse(text=paste('phd[[',seq_along(phd),']]', sep='',collapse='+')))
#phd.plot <- drop.tip(phd.plot, which(grepl('NOEXIST', phd.plot$tip.label)), root.edge=1)
phd.plot <- ladderize(phd.plot)
tmp <- paste(indir, '/', gsub('DATEDTREE','BEASTDATEDTREE',tmp), sep='')
pdf(file=gsub('newick','pdf',tmp), w=10, h=Ntip(phd.plot)*0.1)
plot(phd.plot, show.tip=TRUE, cex=0.5)
dev.off()
bxml.template.pol <- bxml.template.polft
}
else
{
phd <- NULL
bxml.template.pol <- bxml.template.polut
}
#
# create BEAST XML
#
#
# POL
#
if(1)
{
cat(paste('\ncreate POL BEAST XML file for seqs=',paste( seq.select[,LABEL], collapse=' ')))
pool.infile <- paste( substr(infile,1,nchar(infile)-21),'_TEST_pol', sep='' )
# write XML file with new sequences
bxml <- newXMLDoc(addFinalizer=T)
bxml.beast <- newXMLNode("beast", doc=bxml, addFinalizer=T)
tmp <- newXMLCommentNode(text=paste("Generated by HIVCLUST from template",infile.beast.pol), parent=bxml.beast, doc=bxml, addFinalizer=T)
# add new taxa
tmp <- subset(seq.select, select=LABEL)
tmp[, BEASTlabel:=LABEL]
setnames(tmp, 'LABEL', 'FASTASampleCode')
bxml <- hivc.beast.add.taxa(bxml, tmp, beast.label.datepos= 4, beast.label.sep= '|', beast.date.direction= "forwards", beast.date.units= "years", verbose=1)
# add alignment for every transmission cluster
for(clu in seq.select[, unique(IDCLU)])
{
tmp <- tolower(do.call('rbind',strsplit(subset(seq.select, IDCLU==clu)[, POL],'')))
rownames(tmp) <- subset(seq.select, IDCLU==clu)[, LABEL]
tmp <- as.DNAbin(tmp)
bxml <- hivc.beast.add.alignment(bxml, tmp, beast.alignment.id=paste("POL.alignment_CLU",clu,sep=''), beast.alignment.dataType= "nucleotide", verbose=1)
}
# add starting tree for every transmission cluster
for(clu in names(phd))
{
bxml <- hivc.beast.add.startingtree(bxml, phd[[clu]], beast.rootHeight= NA, beast.usingDates="true", beast.newickid=paste("startingTree_CLU",clu,sep=''), beast.brlunits="years", verbose=1)
}
# add CODON patterns for every alignment
for(clu in seq.select[, unique(IDCLU)])
for(k in 1:3)
{
bxml <- hivc.beast.add.patterns(bxml, paste("POL.n100.rlx.gmrf.CP",k,".patterns_CLU",clu,sep=''), paste("POL.alignment_CLU",clu,sep=''), k, beast.patterns.every=3, beast.patterns.strip='false', verbose=1)
}
# add tree model for every alignment
for(clu in seq.select[, unique(IDCLU)])
{
bxml <- hivc.beast.add.treemodel(bxml, paste("treeModel_CLU",clu,sep=''), paste("treeModel.rootHeight_CLU",clu,sep=''), paste("treeModel.internalNodeHeights_CLU",clu,sep=''), paste("treeModel.allInternalNodeHeights_CLU",clu,sep=''), newick.id=paste("startingTree_CLU",clu,sep=''), internalNodes='true', rootNode='true', verbose=1)
}
# add variableDemographic
tmp <- paste('treeModel_CLU',seq.select[, unique(IDCLU)], sep='')
bxml <- hivc.beast.add.variableDemographic(bxml, 'demographic', tmp, 'coalescent', type='stepwise', useMidpoints='true',
popSize.id='demographic.popSize', popSize.value='500.0',
demographic.indicators.id='demographic.indicators', demographic.indicators.value='0.0',
demographic.popMeanDist.id='demographic.populationMeanDist', demographic.popMean.id='demographic.populationMean', demographic.popMean.value='500.0',
sumStatistic.id='demographic.populationSizeChanges', sumStatistic.elementwise='true',
verbose=1)
# population mean value: fix?
# add uncorrelated relaxed clock for every transmission cluster
tmp <- seq.select[, unique(IDCLU)]
clu <- tmp[1]
bxml <- hivc.beast.add.discretizedBranchRates(bxml, paste("branchRates_CLU",clu,sep=''), paste("treeModel_CLU",clu,sep=''), paste("branchRates.categories_CLU",clu,sep=''),
mean.id="ucld.mean", mean.value="0.0025", mean.lower="0.0", sd.id="ucld.stdev", sd.value="0.3333333333333333", sd.lower="0.0", meanInRealSpace="true",
rateCategories.dimension= subset(seq.select, IDCLU==clu)[, 2*(length(unique(IDPOP))-1)], verbose=1)
for(clu in tmp[-1])
{
bxml <- hivc.beast.add.discretizedBranchRates(bxml, paste("branchRates_CLU",clu,sep=''), paste("treeModel_CLU",clu,sep=''), paste("branchRates.categories_CLU",clu,sep=''),
mean.idref="ucld.mean", sd.idref="ucld.stdev", meanInRealSpace="true",
rateCategories.dimension= subset(seq.select, IDCLU==clu)[, 2*(length(unique(IDPOP))-1)], verbose=1)
}
# add rate statistic for first cluster
bxml <- hivc.beast.add.rateStatistics(bxml, 'branchRates', paste('treeModel_CLU',seq.select[1, IDCLU], sep=''), paste('branchRates_CLU',seq.select[1, IDCLU], sep=''))
# add GTR CP1 2 3 models
for(k in 1:3)
{
clu <- seq.select[, unique(IDCLU)][1]
dummy <- hivc.beast.add.gtrModel(bxml, paste("POL.n100.rlx.gmrf.CP",k,".gtr_CLU", clu,sep=""), paste("POL.n100.rlx.gmrf.CP",k,".patterns_CLU",clu,sep=''), paste("POL.n100.rlx.gmrf.CP",k,".frequencies_CLU",clu,sep=''),
rateAC.id=paste("POL.n100.rlx.gmrf.CP",k,".ac",sep=""), rateAC.value="0.3", rateAC.lower="0.0",
rateAG.id=paste("POL.n100.rlx.gmrf.CP",k,".ag",sep=""), rateAG.value="1.0", rateAG.lower="0.0",
rateAT.id=paste("POL.n100.rlx.gmrf.CP",k,".at",sep=""), rateAT.value="0.2", rateAT.lower="0.0",
rateCG.id=paste("POL.n100.rlx.gmrf.CP",k,".cg",sep=""), rateCG.value="0.2", rateCG.lower="0.0",
rateGT.id=paste("POL.n100.rlx.gmrf.CP",k,".gt",sep=""), rateGT.value="0.1", rateGT.lower="0.0",
frequencies.dimension='4', frequencyModel.dataType='nucleotide')
for(clu in seq.select[, unique(IDCLU)][-1])
dummy <- hivc.beast.add.gtrModel(bxml, paste("POL.n100.rlx.gmrf.CP",k,".gtr_CLU", clu,sep=""), paste("POL.n100.rlx.gmrf.CP",k,".patterns_CLU",clu,sep=''), paste("POL.n100.rlx.gmrf.CP",k,".frequencies_CLU",clu,sep=''),
rateAC.idref=paste("POL.n100.rlx.gmrf.CP",k,".ac",sep=""),
rateAG.idref=paste("POL.n100.rlx.gmrf.CP",k,".ag",sep=""),
rateAT.idref=paste("POL.n100.rlx.gmrf.CP",k,".at",sep=""),
rateCG.idref=paste("POL.n100.rlx.gmrf.CP",k,".cg",sep=""),
rateGT.idref=paste("POL.n100.rlx.gmrf.CP",k,".gt",sep=""),
frequencies.dimension='4', frequencyModel.dataType='nucleotide')
}
# add CP1 2 3 site models
for(k in 1:3)
{
clu <- seq.select[, unique(IDCLU)][1]
dummy <- hivc.beast.add.siteModel(bxml, paste("POL.n100.rlx.gmrf.CP",k,".siteModel_CLU", clu,sep=""), paste("POL.n100.rlx.gmrf.CP",k,".gtr_CLU", clu,sep=""),
relativeRate.id=paste("POL.n100.rlx.gmrf.CP",k,".mu",sep=""), relativeRate.value="1.0", relativeRate.lower="0.0",
gamma.id=paste("POL.n100.rlx.gmrf.CP",k,".alpha",sep=""), gamma.value="0.5", gamma.lower="0.0",
gammaCategories="4", verbose=1)
for(clu in seq.select[, unique(IDCLU)][-1])
dummy <- hivc.beast.add.siteModel(bxml, paste("POL.n100.rlx.gmrf.CP",k,".siteModel_CLU", clu,sep=""), paste("POL.n100.rlx.gmrf.CP",k,".gtr_CLU", clu,sep=""),
relativeRate.idref=paste("POL.n100.rlx.gmrf.CP",k,".mu",sep=""), gamma.idref=paste("POL.n100.rlx.gmrf.CP",k,".alpha",sep=""), gammaCategories="4", verbose=1)
}
# copy compoundParameter
dummy <- addChildren( bxml.beast, xmlClone(getNodeSet(bxml.template.pol, paste("//*[@id='ALL.n100.rlx.gmrf.allMus']",sep=''))[[1]], addFinalizer=T, doc=bxml) )
# add treeLikelihood
for(clu in seq.select[, unique(IDCLU)])
for(k in 1:3)
{
dummy <- hivc.beast.add.treeLikelihood(bxml, paste("POL.n100.rlx.gmrf.CP",k,".treeLikelihood_CLU",clu,sep=""),
paste("POL.n100.rlx.gmrf.CP",k,".patterns_CLU",clu,sep=''),
paste("treeModel_CLU",clu,sep=''),
paste('POL.n100.rlx.gmrf.CP',k,'.siteModel_CLU',clu,sep=''),
paste("branchRates_CLU",clu,sep=''), useAmbiguities='false', verbose=1)
}
# add compound likelihood
tmp <- as.vector(sapply(seq.select[, unique(IDCLU)], function(clu) sapply(1:3,function(k){ paste("POL.n100.rlx.gmrf.CP",k,".treeLikelihood_CLU",clu,sep="") }) ))
bxml <- hivc.beast.add.likelihood(bxml, 'likelihood', tmp)
# copy operators
dummy <- addChildren( bxml.beast, xmlClone(getNodeSet(bxml.template.pol, paste("//operators",sep=''))[[1]], addFinalizer=T, doc=bxml) )
# add branch rate category operators
for(clu in seq.select[, unique(IDCLU)])
{
bxml <- hivc.beast.add.uniformIntegerOperator(bxml, paste("branchRates.categories_CLU",clu,sep=''), "10")
bxml <- hivc.beast.add.swapOperator(bxml, paste("branchRates.categories_CLU",clu,sep=''), "1", "10", "false")
}
# copy mcmc
dummy <- addChildren( bxml.beast, xmlClone(getNodeSet(bxml.template.pol, paste("//mcmc",sep=''))[[1]], addFinalizer=T, doc=bxml) )
# add tree log for every transmission cluster
for(clu in seq.select[, unique(IDCLU)])
{
bxml <- hivc.beast.add.logTree(bxml, paste("logTree_CLU",clu,sep=''), paste("treeModel_CLU",clu,sep=''), paste("branchRates_CLU",clu,sep=''), 'posterior', logEvery='3000', fileName=paste(pool.infile,'_CLU', clu, '_', select, '.trees', sep=''), nexusFormat='true', sortTranslationTable='true', verbose=1)
}
# change MCMC attributes
bxml <- hivc.beast.adjust.mcmc(bxml, beast.mcmc.chainLength=3000000, beast.mcmc.logEvery=3000, verbose=1)
# change outfile name of log
tmp <- getNodeSet(bxml, "//*[@id='fileLog']")[[1]]
xmlAttrs(tmp)["fileName"] <- paste(pool.infile,'_', select, '.log', sep='')
# add VDAnalysis
tmp <- sapply(seq.select[, unique(IDCLU)], function(clu) paste(pool.infile,'_CLU', clu, '_', select, '.trees', sep=''))
bxml <- hivc.beast.add.VDAnalysis(bxml, 'demographic.analysis', paste(pool.infile,'_', select, '.log', sep=''), tmp, paste(pool.infile,'_', select, '_VDA.csv', sep=''), populationModelType='stepwise', populationFirstColumn='demographic.popSize1', indicatorsFirstColumn='demographic.indicators1', burnIn="0.1", useMidpoints="true", verbose=1)
# write to file
file <- paste(indir,'/',pool.infile,'_',select,".xml", sep='')
cat(paste("\nwrite xml file to",file))
saveXML(bxml, file=file)
}
}
}
##--------------------------------------------------------------------------------------------------------
## check simulated sequences: create ExaML tree and estimate R2
## olli 14.09.14
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.SSApg.BEAST<- function()
{
require(phytools)
require(hivclust)
require(XML)
# load outgroup sequences
file <- system.file(package="PANGEA.HIV.sim", "misc",'PANGEA_SSAfg_HXB2outgroup.R')
cat(paste('\nLoading outgroup seq from file', file))
load(file) #expect "outgroup.seq.gag" "outgroup.seq.pol" "outgroup.seq.env"
tree.id.labelsep <- '|'
tree.id.label.idx.ctime <- 4
select <- 'nseq800'
select <- 'nseq500'
select <- 'mseq100'
indir <- '/Users/Oliver/duke/2014_Gates/methods_comparison_pipeline/150201b'
#indir <- '/Users/Oliver/git/HPTN071sim/tmp140914/140716_RUN001_INTERNAL'
outdir <- indir
infiles <- list.files(indir, '.*INTERNAL.R$', full.names=FALSE)
#stopifnot(length(infiles)==1)
# read BEAST template files
infile.beast.pol <- system.file(package="PANGEA.HIV.sim", "misc",'BEAST_template_vTESTpol.xml')
bxml.template.polut <- xmlTreeParse(infile.beast.pol, useInternalNodes=TRUE, addFinalizer = TRUE)
infile.beast.pol <- system.file(package="PANGEA.HIV.sim", "misc",'BEAST_template_vTESTpol_fixedtree.xml')
bxml.template.polft <- xmlTreeParse(infile.beast.pol, useInternalNodes=TRUE, addFinalizer = TRUE)
infile.beast.gag <- system.file(package="PANGEA.HIV.sim", "misc",'BEAST_template_vTESTgag.xml')
bxml.template.gag <- xmlTreeParse(infile.beast.gag, useInternalNodes=TRUE, addFinalizer = TRUE)
infile.beast.env <- system.file(package="PANGEA.HIV.sim", "misc",'BEAST_template_vTESTenv.xml')
bxml.template.env <- xmlTreeParse(infile.beast.env, useInternalNodes=TRUE, addFinalizer = TRUE)
#
# run
#
for(i in seq_along(infiles))
{
infile <- infiles[i]
# load simulated data
file <- paste(indir, '/', infile, sep='')
cat(paste('\nLoading file', file))
load(file) #expect "df.epi" "df.trms" "df.inds" "df.sample" "df.seq"
# concatenate sequences
tmp <- tolower(do.call('rbind',strsplit(df.seq[, GAG],'')))
rownames(tmp) <- df.seq[, LABEL]
df.seq.gag <- as.DNAbin(tmp)
tmp <- tolower(do.call('rbind',strsplit(df.seq[, POL],'')))
rownames(tmp) <- df.seq[, LABEL]
df.seq.pol <- as.DNAbin(tmp)
tmp <- tolower(do.call('rbind',strsplit(df.seq[, ENV],'')))
rownames(tmp) <- df.seq[, LABEL]
df.seq.env <- as.DNAbin(tmp)
seq <- cbind(df.seq.gag,df.seq.pol,df.seq.env)
tmp <- cbind(outgroup.seq.gag[,1:ncol(df.seq.gag)], outgroup.seq.pol, outgroup.seq.env)
seq <- rbind(seq,tmp)
tmp <- dist.dna( seq, model='raw' )
seq.ph <- nj(tmp)
tmp <- which(seq.ph$tip.label=="HXB2")
seq.ph <- reroot(seq.ph, tmp, seq.ph$edge.length[which(seq.ph$edge[,2]==tmp)])
#
tmp <- hivc.clu.brdist.stats(seq.ph, eval.dist.btw="leaf", stat.fun=hivc.clu.min.transmission.cascade)
if(grepl('nseq',select))
{
thresh.NSEQ <- as.numeric(substring(select, 5))
thresh.brl <- c(seq(0.001, 0.05, 0.001), seq(0.06, 0.5, 0.1))
thresh.nseq <- sapply(thresh.brl, function(x)
{
clustering <- hivc.clu.clusterbythresh(seq.ph, thresh.brl=x, dist.brl=tmp, retval="all")
which(clustering$size.tips>10)
length(which(!is.na(clustering$clu.mem[ seq_len(Ntip(seq.ph))] )))
})
thresh.brl <- thresh.brl[ which(thresh.nseq>=thresh.NSEQ)[1] ]
clustering <- hivc.clu.clusterbythresh(seq.ph, thresh.brl=thresh.brl, dist.brl=tmp, retval="all")
cat(paste('\nFound clusters, n=', length(clustering$clu.idx)))
seq.select <- subset( data.table( PH_NODE_ID=seq_len(Ntip(seq.ph)), CLU_ID=clustering$clu.mem[ seq_len(Ntip(seq.ph))] ), !is.na(CLU_ID) )
seq.select[, LABEL:= seq.select[, seq.ph$tip.label[PH_NODE_ID]] ]
seq.select <- merge(df.seq, seq.select, by='LABEL')
}
if(grepl('mseq',select))
{
thresh.NSEQ <- as.numeric(substring(select, 5))
seq.select <- subset(df.inds, !is.na(IDCLU))
seq.select <- subset(merge(seq.select, seq.select[, list(CLU_N=-length(which(!is.na(TIME_SEQ)))), by='IDCLU'], by='IDCLU'), CLU_N<0 & !is.na(TIME_SEQ))
setkey(seq.select, CLU_N, IDCLU)
tmp <- unique(seq.select)
tmp[, CLU_CN:= tmp[,cumsum(-CLU_N)]]
tmp <- tmp[seq_len( tmp[, which(CLU_CN>=thresh.NSEQ)[1]] ), ]
seq.select <- merge( seq.select, subset(tmp, select=IDCLU), by='IDCLU' )
cat(paste('\nFound clusters, n=', seq.select[, length(unique(IDCLU))]))
cat(paste('\nFound sequences, n=', seq.select[, length(unique(IDPOP))]))
}
if(select=='same')
{
thresh.brl <- seq(0.01, 0.05, 0.001)
thresh.nclu <- sapply(thresh.brl, function(x)
{
clustering <- hivc.clu.clusterbythresh(seq.ph, thresh.brl=x, dist.brl=tmp, retval="all")
max( clustering$size.tips )
})
thresh.brl <- thresh.brl[ which(thresh.nclu>=100)[1] ]
clustering <- hivc.clu.clusterbythresh(seq.ph, thresh.brl=thresh.brl, dist.brl=tmp, retval="all")
cat(paste('\nFound clusters, n=', length(clustering$clu.idx)))
# Take 1 sequence from each cluster
tmp <- which( clustering$clu.mem==which.max( clustering$size.tips ) )
seq.select <- data.table( PH_NODE_ID=tmp[ which(tmp<=Ntip(seq.ph)) ], CLU_ID=which.max(clustering$size.tips) )
seq.select <- seq.select[1:100,]
seq.select[, LABEL:=seq.ph$tip.label[PH_NODE_ID] ]
seq.select <- merge(df.seq, seq.select, by='LABEL')
cat(paste('\nSelected sequences, n=',nrow(seq.select)))
}
if(select=='divergent')
{
# find thresh with ~100 clusters
thresh.brl <- c(seq(0.001, 0.05, 0.001), seq(0.06, 0.5, 0.1))
thresh.nclu <- sapply(thresh.brl, function(x)
{
clustering <- hivc.clu.clusterbythresh(seq.ph, thresh.brl=x, dist.brl=tmp, retval="all")
length(clustering$clu.idx)
})
tmp2 <- which(thresh.nclu>100)
thresh.brl <- ifelse(length(tmp2), thresh.brl[tmp2][1], thresh.brl[which.max(thresh.nclu)])
clustering <- hivc.clu.clusterbythresh(seq.ph, thresh.brl=thresh.brl, dist.brl=tmp, retval="all")
cat(paste('\nFound clusters, n=', length(clustering$clu.idx)))
# Take 1 sequence from each cluster
seq.select <- data.table( PH_NODE_ID=seq_len(Ntip(seq.ph)), CLU_ID=clustering$clu.mem[ seq_len(Ntip(seq.ph)) ] )
seq.select <- subset(seq.select, !is.na(CLU_ID))[, list(LABEL= seq.ph$tip.label[PH_NODE_ID[1]]), by='CLU_ID']
seq.select <- merge(df.seq, seq.select, by='LABEL')
}
#
# read NEWICK trees for each cluster phylogeny, if there
#
tmp <- list.files(indir, '_DATEDTREE.newick$', full.names=FALSE)
tmp <- tmp[ grepl(substr(infile, 1, regexpr('_SIMULATED',infile)), tmp) ]
if(length(tmp))
{
# select
phd <- read.tree(paste(indir, tmp, sep='/'))
tmp2 <- data.table(IDCLU= as.integer(substring(names(phd),7)), IDX=seq_along(phd))
tmp2 <- merge(unique(seq.select), tmp2, by='IDCLU')[, unique(IDX)]
phd <- lapply(tmp2, function(i) phd[[i]] )
# plot
phd.plot <- eval(parse(text=paste('phd[[',seq_along(phd),']]', sep='',collapse='+')))
#phd.plot <- drop.tip(phd.plot, which(grepl('NOEXIST', phd.plot$tip.label)), root.edge=1)
phd.plot <- ladderize(phd.plot)
tmp <- paste(indir, '/', gsub('DATEDTREE','BEASTDATEDTREE',tmp), sep='')
pdf(file=gsub('newick','pdf',tmp), w=10, h=Ntip(phd.plot)*0.1)
plot(phd.plot, show.tip=TRUE, cex=0.5)
dev.off()
bxml.template.pol <- bxml.template.polft
}
else
{
phd <- NULL
bxml.template.pol <- bxml.template.polut
}
#
# create BEAST XML
#
if(0)
{
#
# GAG
#
cat(paste('\ncreate GAG BEAST XML file for seqs=',paste( seq.select[,LABEL], collapse=' ')))
pool.infile <- paste( substr(infile,1,nchar(infile)-21),'_TEST_gag', sep='' )
# write XML file with new sequences
bxml <- newXMLDoc(addFinalizer=T)
bxml.beast <- newXMLNode("beast", doc=bxml, addFinalizer=T)
tmp <- newXMLCommentNode(text=paste("Generated by HIVCLUST from template",infile.beast.pol), parent=bxml.beast, doc=bxml, addFinalizer=T)
# add new set of GAG sequences into GAG alignment
tmp <- tolower(do.call('rbind',strsplit(seq.select[, GAG],'')))
rownames(tmp) <- seq.select[, LABEL]
tmp <- as.DNAbin(tmp)
bxml <- hivc.beast.add.seq(bxml, tmp, df=NULL, beast.label.datepos=4, beast.label.sep= '|', beast.date.direction= "forwards", beast.date.units= "years", beast.alignment.id="GAG.alignment", beast.alignment.dataType= "nucleotide", verbose=1)
# copy from template
bt.beast <- getNodeSet(bxml.template.gag, "//beast")[[1]]
dummy <- sapply(seq.int( 1, xmlSize(bt.beast) ), function(i)
{
if( class(bt.beast[[i]])[1]=="XMLInternalCommentNode" )
dummy<- newXMLCommentNode(text=xmlValue(bt.beast[[i]]), parent=bxml.beast, doc=bxml, addFinalizer=T)
else
dummy<- addChildren( bxml.beast, xmlClone( bt.beast[[i]], addFinalizer=T, doc=bxml ) )
})
# change gmrf dimensions
tmp <- getNodeSet(bxml, "//*[@id='skyride.logPopSize']")
if(length(tmp)!=1) stop("unexpected number of *[@id='skyride.logPopSize'")
tmp <- tmp[[1]]
xmlAttrs(tmp)["dimension"] <- nrow(seq.select)-1
tmp <- getNodeSet(bxml, "//*[@id='skyride.groupSize']")
if(length(tmp)!=1) stop("unexpected number of *[@id='skyride.groupSize'")
tmp <- tmp[[1]]
xmlAttrs(tmp)["dimension"] <- nrow(seq.select)-1
# change outfile name
bxml.onodes <- getNodeSet(bxml, "//*[@fileName]")
tmp <- sapply(bxml.onodes, function(x) xmlGetAttr(x,"fileName"))
tmp <- gsub("(time).","time",tmp,fixed=1)
tmp <- gsub("(subst).","subst",tmp,fixed=1)
tmp <- sapply(strsplit(tmp,'.',fixed=1), function(x) paste(pool.infile,'_',select, '.', tail(x,1), sep=''))
dummy <- sapply(seq_along(bxml.onodes), function(i){ xmlAttrs(bxml.onodes[[i]])["fileName"]<- tmp[i] })
# write to file
file <- paste(indir,'/',pool.infile,'_',select,".xml", sep='')
cat(paste("\nwrite xml file to",file))
saveXML(bxml, file=file)
}
#
# POL
#
if(1)
{
cat(paste('\ncreate POL BEAST XML file for seqs=',paste( seq.select[,LABEL], collapse=' ')))
pool.infile <- paste( substr(infile,1,nchar(infile)-21),'_TEST_pol', sep='' )
# write XML file with new sequences
bxml <- newXMLDoc(addFinalizer=T)
bxml.beast <- newXMLNode("beast", doc=bxml, addFinalizer=T)
tmp <- newXMLCommentNode(text=paste("Generated by HIVCLUST from template",infile.beast.pol), parent=bxml.beast, doc=bxml, addFinalizer=T)
# add new set of POL sequences into POL alignment
tmp <- tolower(do.call('rbind',strsplit(seq.select[, POL],'')))
rownames(tmp) <- seq.select[, LABEL]
tmp <- as.DNAbin(tmp)
bxml <- hivc.beast.add.seq(bxml, tmp, df=NULL, beast.label.datepos=4, beast.label.sep= '|', beast.date.direction= "forwards", beast.date.units= "years", beast.alignment.id="POL.alignment", beast.alignment.dataType= "nucleotide", verbose=1)
df.seq[, BEASTlabel:= LABEL]
setnames(df.seq, 'LABEL', 'FASTASampleCode')
bxml <- hivc.beast.add.startingtree(bxml, phd, df.seq, beast.rootHeight= NA, beast.usingDates="true", beast.newickid= "startingTree", beast.brlunits="years", verbose=1)
# copy from template
bt.beast <- getNodeSet(bxml.template.pol, "//beast")[[1]]
dummy <- sapply(seq.int( 1, xmlSize(bt.beast) ), function(i)
{
if( class(bt.beast[[i]])[1]=="XMLInternalCommentNode" )
dummy<- newXMLCommentNode(text=xmlValue(bt.beast[[i]]), parent=bxml.beast, doc=bxml, addFinalizer=T)
else
dummy<- addChildren( bxml.beast, xmlClone( bt.beast[[i]], addFinalizer=T, doc=bxml ) )
})
# change gmrf dimensions
tmp <- getNodeSet(bxml, "//*[@id='skyride.logPopSize']")
if(length(tmp)!=1) stop("unexpected number of *[@id='skyride.logPopSize'")
tmp <- tmp[[1]]
xmlAttrs(tmp)["dimension"] <- nrow(seq.select)-1
tmp <- getNodeSet(bxml, "//*[@id='skyride.groupSize']")
if(length(tmp)!=1) stop("unexpected number of *[@id='skyride.groupSize'")
tmp <- tmp[[1]]
xmlAttrs(tmp)["dimension"] <- nrow(seq.select)-1
# change MCMC attributes
if(!is.null(phd))
{
bxml <- hivc.beast.adjust.mcmc(bxml, beast.mcmc.chainLength=3000000, beast.mcmc.logEvery=3000, verbose=1)
}
# change outfile name
bxml.onodes <- getNodeSet(bxml, "//*[@fileName]")
tmp <- sapply(bxml.onodes, function(x) xmlGetAttr(x,"fileName"))
tmp <- gsub("(time).","time",tmp,fixed=1)
tmp <- gsub("(subst).","subst",tmp,fixed=1)
tmp <- sapply(strsplit(tmp,'.',fixed=1), function(x) paste(pool.infile,'_',select, '.', tail(x,1), sep=''))
dummy <- sapply(seq_along(bxml.onodes), function(i){ xmlAttrs(bxml.onodes[[i]])["fileName"]<- tmp[i] })
# write to file
file <- paste(indir,'/',pool.infile,'_',select,".xml", sep='')
cat(paste("\nwrite xml file to",file))
saveXML(bxml, file=file)
}
#
# ENV
#
if(0)
{
cat(paste('\ncreate ENV BEAST XML file for seqs=',paste( seq.select[,LABEL], collapse=' ')))
pool.infile <- paste( substr(infile,1,nchar(infile)-21),'_TEST_env', sep='' )
# write XML file with new sequences
bxml <- newXMLDoc(addFinalizer=T)
bxml.beast <- newXMLNode("beast", doc=bxml, addFinalizer=T)
tmp <- newXMLCommentNode(text=paste("Generated by HIVCLUST from template",infile.beast.pol), parent=bxml.beast, doc=bxml, addFinalizer=T)
# add new set of GAG sequences into ENV alignment
tmp <- tolower(do.call('rbind',strsplit(seq.select[, ENV],'')))
rownames(tmp) <- seq.select[, LABEL]
tmp <- as.DNAbin(tmp)
bxml <- hivc.beast.add.seq(bxml, tmp, df=NULL, beast.label.datepos=4, beast.label.sep= '|', beast.date.direction= "forwards", beast.date.units= "years", beast.alignment.id="ENV.alignment", beast.alignment.dataType= "nucleotide", verbose=1)
# copy from template
bt.beast <- getNodeSet(bxml.template.env, "//beast")[[1]]
dummy <- sapply(seq.int( 1, xmlSize(bt.beast) ), function(i)
{
if( class(bt.beast[[i]])[1]=="XMLInternalCommentNode" )
dummy<- newXMLCommentNode(text=xmlValue(bt.beast[[i]]), parent=bxml.beast, doc=bxml, addFinalizer=T)
else
dummy<- addChildren( bxml.beast, xmlClone( bt.beast[[i]], addFinalizer=T, doc=bxml ) )
})
# change gmrf dimensions
tmp <- getNodeSet(bxml, "//*[@id='skyride.logPopSize']")
if(length(tmp)!=1) stop("unexpected number of *[@id='skyride.logPopSize'")
tmp <- tmp[[1]]
xmlAttrs(tmp)["dimension"] <- nrow(seq.select)-1
tmp <- getNodeSet(bxml, "//*[@id='skyride.groupSize']")
if(length(tmp)!=1) stop("unexpected number of *[@id='skyride.groupSize'")
tmp <- tmp[[1]]
xmlAttrs(tmp)["dimension"] <- nrow(seq.select)-1
# change outfile name
bxml.onodes <- getNodeSet(bxml, "//*[@fileName]")
tmp <- sapply(bxml.onodes, function(x) xmlGetAttr(x,"fileName"))
tmp <- gsub("(time).","time",tmp,fixed=1)
tmp <- gsub("(subst).","subst",tmp,fixed=1)
tmp <- sapply(strsplit(tmp,'.',fixed=1), function(x) paste(pool.infile,'_',select, '.', tail(x,1), sep=''))
dummy <- sapply(seq_along(bxml.onodes), function(i){ xmlAttrs(bxml.onodes[[i]])["fileName"]<- tmp[i] })
# write to file
file <- paste(indir,'/',pool.infile,'_',select,".xml", sep='')
cat(paste("\nwrite xml file to",file))
saveXML(bxml, file=file)
}
}
}
##--------------------------------------------------------------------------------------------------------
## check simulated sequences: create ExaML tree and estimate R2
## olli 14.09.14
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.SSApg.ExaMLR2<- function()
{
require(phytools)
require(hivclust)
tree.id.labelsep <- '|'
tree.id.label.idx.ctime <- 4
indir <- '/Users/Oliver/duke/2014_Gates/methods_comparison_pipeline/150227'
outdir <- '/Users/Oliver/duke/2014_Gates/methods_comparison_pipeline/150227'
infiles <- list.files(indir, '.*INTERNAL.R$', full.names=FALSE)
#stopifnot(length(infiles)==1)
#
# run ExaML
#
for(i in seq_along(infiles))
{
infile <- infiles[i]
# load simulated data
file <- paste(indir, '/', infile, sep='')
cat(paste('\nLoading file', file))
load(file) #expect "df.epi" "df.trms" "df.inds" "df.sample" "df.seq"
# load outgroup sequences
file <- system.file(package="PANGEA.HIV.sim", "misc",'PANGEA_SSAfg_HXB2outgroup.R')
cat(paste('\nLoading outgroup seq from file', file))
load(file) #expect "outgroup.seq.gag" "outgroup.seq.pol" "outgroup.seq.env"
tmp <- tolower(do.call('rbind',strsplit(df.seq[, GAG],'')))
rownames(tmp) <- df.seq[, LABEL]
df.seq.gag <- as.DNAbin(tmp)
tmp <- tolower(do.call('rbind',strsplit(df.seq[, POL],'')))
rownames(tmp) <- df.seq[, LABEL]
df.seq.pol <- as.DNAbin(tmp)
tmp <- tolower(do.call('rbind',strsplit(df.seq[, ENV],'')))
rownames(tmp) <- df.seq[, LABEL]
df.seq.env <- as.DNAbin(tmp)
#
# run ExaML on gag
#
seq <- df.seq.gag
seq <- rbind(seq, outgroup.seq.gag[, seq_len(ncol(seq))])
infile.seq.sig <- "Sun_Sep_14_12:59:06_2013"
infile.seq <- paste(substr(infile,1,nchar(infile)-20),'INFO_simu_gagseq',sep='')
file <- paste( outdir, '/', infile.seq,'_',gsub('/',':',infile.seq.sig),'.R', sep='' )
save(seq, file=file)
# run ExaML
cmd <- hivc.cmd.examl.bootstrap.on.one.machine(indir, infile.seq, infile.seq.sig, infile.seq.sig, bs.from=0, bs.to=0, verbose=1)
cmd <- hivc.cmd.hpcwrapper(cmd, hpc.walltime=21, hpc.q= NA, hpc.mem="450mb", hpc.nproc=1)
cmd.hpccaller(outdir, paste("exa",paste(strsplit(date(),split=' ')[[1]],collapse='_',sep=''),sep='.'), cmd)
Sys.sleep(1)
#
# run ExaML on pol
#
seq <- df.seq.pol
seq <- rbind(seq, outgroup.seq.pol[, seq_len(ncol(seq))])
infile.seq.sig <- "Sun_Sep_14_12:59:06_2013"
infile.seq <- paste(substr(infile,1,nchar(infile)-20),'INFO_simu_polseq',sep='')
file <- paste( outdir, '/', infile.seq,'_',gsub('/',':',infile.seq.sig),'.R', sep='' )
save(seq, file=file)
# run ExaML
cmd <- hivc.cmd.examl.bootstrap.on.one.machine(indir, infile.seq, infile.seq.sig, infile.seq.sig, bs.from=0, bs.to=0, verbose=1)
cmd <- hivc.cmd.hpcwrapper(cmd, hpc.walltime=21, hpc.q= NA, hpc.mem="450mb", hpc.nproc=1)
cmd.hpccaller(outdir, paste("exa",paste(strsplit(date(),split=' ')[[1]],collapse='_',sep=''),sep='.'), cmd)
Sys.sleep(1)
#
# run ExaML on env
#
seq <- df.seq.env
seq <- rbind(seq, outgroup.seq.env[, seq_len(ncol(seq))])
infile.seq.sig <- "Sun_Sep_14_12:59:06_2013"
infile.seq <- paste(substr(infile,1,nchar(infile)-20),'INFO_simu_envseq',sep='')
file <- paste( outdir, '/', infile.seq,'_',gsub('/',':',infile.seq.sig),'.R', sep='' )
save(seq, file=file)
# run ExaML
cmd <- hivc.cmd.examl.bootstrap.on.one.machine(indir, infile.seq, infile.seq.sig, infile.seq.sig, bs.from=0, bs.to=0, verbose=1)
cmd <- hivc.cmd.hpcwrapper(cmd, hpc.walltime=21, hpc.q= NA, hpc.mem="450mb", hpc.nproc=1)
cmd.hpccaller(outdir, paste("exa",paste(strsplit(date(),split=' ')[[1]],collapse='_',sep=''),sep='.'), cmd)
Sys.sleep(1)
if(1)
{
#
# run ExaML on concatenated
#
seq <- cbind(df.seq.gag,df.seq.pol,df.seq.env)
tmp <- cbind(outgroup.seq.gag[,1:ncol(df.seq.gag)], outgroup.seq.pol, outgroup.seq.env)
seq <- rbind(seq,tmp)
infile.seq.sig <- "Sun_Sep_14_12:59:06_2013"
infile.seq <- paste(substr(infile,1,nchar(infile)-20),'INFO_simu_concseq',sep='')
file <- paste( outdir, '/', infile.seq,'_',gsub('/',':',infile.seq.sig),'.R', sep='' )
save(seq, file=file)
# run ExaML
cmd <- hivc.cmd.examl.bootstrap.on.one.machine(indir, infile.seq, infile.seq.sig, infile.seq.sig, bs.from=0, bs.to=0, verbose=1)
cmd <- hivc.cmd.hpcwrapper(cmd, hpc.walltime=21, hpc.q= NA, hpc.mem="450mb", hpc.nproc=1)
cmd.hpccaller(outdir, paste("exa",paste(strsplit(date(),split=' ')[[1]],collapse='_',sep=''),sep='.'), cmd)
Sys.sleep(1)
}
}
#
# evaluate R2 for pol
#
gene <- 'pol'
infiles <- list.files(indir, paste('^ExaML_result.*',gene,'seq.*finaltree.000$',sep=''), full.names=FALSE)
for(i in seq_along(infiles))
{
# read files
infile <- infiles[i]
file <- paste(indir,'/',infile,sep='')
ph <- read.tree(file)
tmp <- regmatches(infile,regexpr('.*_INFO',infile))
file <- paste(indir,'/', substr(tmp, 14, nchar(tmp)-4),"SIMULATED_INTERNAL.R",sep='')
#tmp <- substring(infile, 14)
#tmp <- list.files(indir, paste(substr(tmp, 1, regexpr('_INFO',tmp) ),'.*INTERNAL.R$', sep=''), full.names=FALSE)
#cat(paste('\nLoad file=',tmp))
load(file)
# re-root at HXB2 (outgroup, because the simulated sequences are HIV-1C)
tmp <- which(ph$tip.label=="HXB2")
ph <- reroot(ph, tmp, ph$edge.length[which(ph$edge[,2]==tmp)])
ph <- ladderize(ph)
file <- paste( outdir, '/', substr(infile,1,nchar(infile)-20),'INFO_simu_ExaML',gene,'.pdf', sep='' )
pdf(file=file, w=10, h=150)
plot(ph, show.tip=TRUE, cex=0.5)
add.scale.bar()
dev.off()
# get root to tip divergence
ph <- drop.tip(ph,'HXB2')
file <- paste( outdir, '/', substr(infile,1,nchar(infile)-20),'INFO_simu_ExaML',gene,'.newick', sep='' )
write.tree(file=file, ph)
tmp <- node.depth.edgelength(ph)
ph.info <- data.table(LABEL=ph$tip.label, ROOT2TIP=tmp[seq_len(Ntip(ph))] )
set(ph.info, NULL, 'IDPOP', ph.info[, sapply(strsplit(LABEL, tree.id.labelsep, fixed=TRUE),'[[',1) ])
set(ph.info, NULL, 'IDPOP', ph.info[, as.integer(substr(IDPOP, 7, nchar(IDPOP)))])
ph.info <- merge(ph.info, subset(df.inds, select=c(IDPOP, IDCLU)), by='IDPOP')
set(ph.info, NULL, 'IDCLU', ph.info[, factor(IDCLU)])
set(ph.info, NULL, 'CALENDAR_TIME', ph.info[, as.numeric(sapply(strsplit(LABEL, tree.id.labelsep, fixed=TRUE),'[[',tree.id.label.idx.ctime))] )
tmp <- lm(ROOT2TIP~CALENDAR_TIME, data=ph.info)
set( ph.info, NULL, 'ROOT2TIP_LM', predict(tmp, type='response') )
tmp2 <- c( R2=round(summary(tmp)$r.squared,d=3),
SLOPE= as.numeric(round(coef(tmp)['CALENDAR_TIME'],d=4)),
TMRCA=as.numeric(round( -coef(tmp)['(Intercept)']/coef(tmp)['CALENDAR_TIME'], d=1 )) )
ggplot(ph.info, aes(x=CALENDAR_TIME, y=ROOT2TIP, colour=IDCLU)) + geom_point(alpha=0.75) + geom_line(alpha=0.1, aes(group=IDCLU)) + geom_line(aes(y=ROOT2TIP_LM)) +
#scale_x_continuous(breaks=seq(1980,2020,2)) +
scale_colour_discrete(guide=FALSE) +
labs(x='Sequence sampling date', y=paste('root-to-tip divergence\n(HIV-1',gene,'sequences)')) +
annotate("text", x=ph.info[, min(CALENDAR_TIME)], y=ph.info[, 0.9*max(ROOT2TIP)], label=paste("R2=", tmp2['R2'],'\nSlope=',tmp2['SLOPE'],'\nTMRCA=',tmp2['TMRCA'], sep=''), hjust = 0, size = 4) +
theme(legend.position=c(0,1), legend.justification=c(0,1))
file <- paste( outdir, '/', substr(infile,1,nchar(infile)-20),'INFO_simu_ExaML',gene,'R2.pdf', sep='' )
ggsave(file=file, w=10, h=6)
#
if(0)
{
nclu <- subset(df.inds, !is.na(IDCLU))[, length(unique(IDCLU))]
cat(paste('\nNumber of transmission clusters, n=', nclu))
tmp <- hivc.clu.brdist.stats(ph, eval.dist.btw="leaf", stat.fun=hivc.clu.min.transmission.cascade)
# almost random choice: 0.05
clustering <- hivc.clu.clusterbythresh(ph, thresh.brl=0.05, dist.brl=tmp, retval="all")
tmp <- data.table( LABEL=ph$tip.label, EXACLUID= clustering$clu.mem[ seq_len(Ntip(ph))] )
tmp[, IDPOP:= tmp[, as.integer(substring(sapply(strsplit(LABEL,'|',fixed=TRUE),'[[',1),7))]]
ph.inds <- merge( df.inds, tmp, by='IDPOP' )
tmp <- subset(ph.inds, !is.na(EXACLUID))[, {
tmp <- extract.clade(ph, hivc.clu.mrca(ph, LABEL)$mrca, root.edge=1)
tmp$root.edge <- 0
list( LABEL=tmp$tip.label, ROOT2TIP=node.depth.edgelength(tmp)[seq_len(Ntip(tmp))] )
}, by='EXACLUID']
ph.inds <- merge(ph.inds, tmp, by=c('EXACLUID','LABEL'), all.x=TRUE)
ph.inds <- merge(ph.inds, subset(ph.inds, !is.na(EXACLUID))[, list(EXACLUID_N=length(IDPOP)), by='EXACLUID'], by='EXACLUID', all.x=TRUE)
clx.info <- subset(ph.inds, EXACLUID_N==max(EXACLUID_N, na.rm=1))
ggplot(clx.info, aes(x=TIME_SEQ, y=ROOT2TIP, colour=IDCLU)) + geom_point(alpha=0.75)
}
}
}
##--------------------------------------------------------------------------------------------------------
## check ancestral sequences from BEAST XML, create random draw to check
##--------------------------------------------------------------------------------------------------------
project.PANGEA.RootSeqSim.SIMU.SSAfg.checkancestralseq.createdataset<- function()
{
require(phytools)
tree.id.burnin <- 2e7
tree.id.labelsep <- '|'
tree.id.label.idx.ctime <- 4
dir.name <- '/Users/Oliver/duke/2014_Gates'
indir <- paste(dir.name,'methods_comparison_rootseqsim/140813',sep='/')
infile <- "PANGEA_SSAfgBwhRc-_140811_n390_AncSeq.R"
outdir <- indir
outfile.fg <- infile
outfile.partialpol <- "PANGEA_SSApolBwhRc-_140811_n390_AncSeq.R"
outsignat <- "Mon_Aug_17_17:05:23_2014"
# load ancestral sequences
load( paste(indir, infile, sep='/') ) # expect anc.seq.gag, anc.seq.pol, anc.seq.env, anc.seq.info
# load aligned HXB2 as outgroup
load('/Users/Oliver/git/HPTN071sim/data_rootseq/PANGEA_SSAfg_140806_HXB2outgroup.R') #expect "outgroup.seq.gag" "outgroup.seq.pol" "outgroup.seq.env"
#
# basic checks
#
if(0)
{
tmp <- anc.seq.info[, sapply( strsplit(LABEL, tree.id.labelsep, fixed=TRUE),'[[', 1) ]
anc.seq.info[, POOL:= regmatches(tmp, regexpr('pool[0-9]+', tmp))]
set(anc.seq.info, NULL, 'POOL', anc.seq.info[, as.numeric(substr(POOL, 5, nchar(POOL)))])
ggplot(anc.seq.info, aes(x=CALENDAR_TIME)) + geom_histogram(binwidth=2) + facet_grid(.~POOL, margins=0)
file <- paste(indir, '/', substr(infile,1,nchar(infile)-2), '_calendartime.pdf' , sep='')
ggsave(file=file, w=8, h=6)
# most sequences between 1940 - 1980
subset(anc.seq.info, floor(CALENDAR_TIME)==1940)
}
# sample data set to run ExaML
anc.seq.info[, CALENDAR_YR:=anc.seq.info[, floor(CALENDAR_TIME)]]
# ~ 1400 sequences from 1940
# sample 10 times 1e3 sequences randomly from exp increasing prevalence and estimate tree to calculate root to tip divergence + clustering on fg and partial pol
s.seed <- 42
s.PREV.MAX <- 0.25
s.PREV.MIN <- 0.01
s.RANGE <- 40
s.size <- 1e3
s.baseline.ancseq.time <- 1940
s.baseline.calendar.time <- 1980
s.LENGTH.PARTIAL.POL <- 1500
tree.id.labelidx.ctime <- 4
tmp <- log( 1+s.PREV.MAX-s.PREV.MIN ) / (s.RANGE-1)
tmp <- exp( tmp*seq.int(0,s.RANGE-1) ) - 1 + s.PREV.MIN
seq.s <- c(s.PREV.MIN, diff(tmp))
seq.s <- data.table( SEQ_N= round( seq.s/s.PREV.MAX*s.size ), ANCSEQ_YR=seq_along(seq.s)+s.baseline.ancseq.time-1 )
anc.seq.info <- subset(anc.seq.info, CALENDAR_YR>=s.baseline.ancseq.time & CALENDAR_YR<=(s.baseline.ancseq.time+s.RANGE-1))
# draw partial pol genome sequences - length is first 1500 sites
# and draw full genome sequences
set.seed(s.seed)
for(check.draw in 1:5)
{
cat(paste('\nprocess checkdraw',check.draw))
# draw a large enough number of ancseq labels from the 3 pools for each year to accommodate SEQ_N/3 anc seqs from each pool
anc.seq.infodraw <- anc.seq.info[, {
tmp <- seq.s$SEQ_N[ which(seq.s$ANCSEQ_YR==CALENDAR_YR) ]
list(LABEL= sample(LABEL, 2*tmp, replace=FALSE), SEQ_N=tmp, SEQ_N_GRACE=2*tmp)
}, by='CALENDAR_YR']
anc.seq.draw <- anc.seq.pol[anc.seq.infodraw[, LABEL], seq_len(s.LENGTH.PARTIAL.POL)]
#
# make sure the drawn sequences are unique on partial POL
#
anc.seq.draw <- seq.unique(anc.seq.draw)
anc.seq.infodraw <- merge( data.table(LABEL=rownames(anc.seq.draw)), anc.seq.infodraw, by='LABEL' )
stopifnot( anc.seq.infodraw[, list(SEQ_N_GRACE=length(LABEL), SEQ_N=SEQ_N[1]), by='CALENDAR_YR'][, all(SEQ_N_GRACE>=SEQ_N)] )
anc.seq.infodraw <- anc.seq.infodraw[, list(LABEL= LABEL[seq_len(SEQ_N[1])]), by='CALENDAR_YR']
# set new calendar time for sequences
set(anc.seq.infodraw, NULL, 'LABEL_NEW', anc.seq.infodraw[, as.numeric( sapply( strsplit(LABEL,tree.id.labelsep,fixed=TRUE), '[[', tree.id.labelidx.ctime) ) ])
set(anc.seq.infodraw, NULL, 'LABEL_NEW', anc.seq.infodraw[, LABEL_NEW-s.baseline.ancseq.time+s.baseline.calendar.time])
anc.seq.infodraw <- anc.seq.infodraw[, {
tmp <- strsplit(LABEL,tree.id.labelsep,fixed=TRUE)[[1]]
tmp[tree.id.labelidx.ctime] <- LABEL_NEW
list(LABEL_NEW=paste(tmp, collapse=tree.id.labelsep,sep=''))
}, by='LABEL']
setkey(anc.seq.infodraw, LABEL)
#
# select partial POL seqs
#
anc.seq.draw <- anc.seq.pol[anc.seq.infodraw[, LABEL], seq_len(s.LENGTH.PARTIAL.POL)]
rownames(anc.seq.draw) <- anc.seq.infodraw[ rownames(anc.seq.draw), ][, LABEL_NEW]
anc.seq.draw <- rbind(anc.seq.draw, outgroup.seq.pol[, seq_len(s.LENGTH.PARTIAL.POL)])
# save to file, including outgroup
file <- paste( outdir, '/', substr(outfile.partialpol,1,nchar(outfile.partialpol)-2),'_checkdraw', check.draw,'_', insignat, '.R', sep='' )
cat(paste('\nsave to file', file))
save(anc.seq.draw, file=file)
# get NJ tree with HXB2 outgroup
tmp <- dist.dna(anc.seq.draw, model='raw')
anc.seq.nj <- nj(tmp)
tmp <- which(anc.seq.nj$tip.label=="HXB2")
anc.seq.nj <- reroot(anc.seq.nj, tmp, anc.seq.nj$edge.length[which(anc.seq.nj$edge[,2]==tmp)])
anc.seq.nj <- ladderize(anc.seq.nj)
file <- paste( outdir, '/', substr(outfile.partialpol,1,nchar(outfile.partialpol)-2),'_checkdraw', check.draw,'_NJ.pdf', sep='' )
pdf(file=file, w=10, h=80)
plot(anc.seq.nj, show.tip=TRUE, cex=0.5)
dev.off()
# get root to tip divergence
anc.seq.nj <- drop.tip(anc.seq.nj,'HXB2')
tmp <- node.depth.edgelength(anc.seq.nj)
anc.seq.nj.info <- data.table(LABEL=anc.seq.nj$tip.label, ROOT2TIP=tmp[seq_len(Ntip(anc.seq.nj))] )
#tmp<- distRoot(anc.seq.nj)
#set(anc.seq.nj.info, NULL, 'distRoot', tmp)
set(anc.seq.nj.info, NULL, 'CALENDAR_TIME', anc.seq.nj.info[, as.numeric(sapply(strsplit(LABEL, tree.id.labelsep, fixed=TRUE),'[[',tree.id.label.idx.ctime))] )
tmp <- lm(ROOT2TIP~CALENDAR_TIME, data=anc.seq.nj.info)
set( anc.seq.nj.info, NULL, 'ROOT2TIP_LM', predict(tmp, type='response') )
tmp2 <- c( R2=round(summary(tmp)$r.squared,d=3), SLOPE= as.numeric(round(coef(tmp)['CALENDAR_TIME'],d=4)), TMRCA=as.numeric(round( -coef(tmp)['(Intercept)']/coef(tmp)['CALENDAR_TIME'], d=1 )) )
ggplot(anc.seq.nj.info, aes(x=CALENDAR_TIME, y=ROOT2TIP)) + geom_point(alpha=0.5) + geom_line(aes(y=ROOT2TIP_LM)) +
#scale_x_continuous(breaks=seq(1980,2020,2)) +
labs(x='Sequence sampling date', y='root-to-tip divergence') +
annotate("text", x=anc.seq.nj.info[, min(CALENDAR_TIME)], y=anc.seq.nj.info[, 0.9*max(ROOT2TIP)], label=paste("R2=", tmp2['R2'],'\nSlope=',tmp2['SLOPE'],'\nTMRCA=',tmp2['TMRCA'], sep=''), hjust = 0, size = 4) +
theme(legend.position=c(0,1), legend.justification=c(0,1))
file <- paste( outdir, '/', substr(outfile.partialpol,1,nchar(outfile.partialpol)-2),'_checkdraw', check.draw,'_Root2Tip.pdf', sep='' )
ggsave(file=file, w=10, h=6)
#
# select the same full genome seqs plus aligned HXB2 as outgroup
#
anc.seq.draw <- do.call( 'cbind', list( anc.seq.gag[anc.seq.infodraw[, LABEL], ], anc.seq.pol[anc.seq.infodraw[, LABEL], ], anc.seq.env[anc.seq.infodraw[, LABEL], ] ) )
rownames(anc.seq.draw) <- anc.seq.infodraw[ rownames(anc.seq.draw), ][, LABEL_NEW]
anc.seq.draw <- rbind(anc.seq.draw, cbind(outgroup.seq.gag[,seq_len(ncol(anc.seq.gag))], outgroup.seq.pol, outgroup.seq.env))
# get NJ tree
tmp <- dist.dna(anc.seq.draw, model='raw')
anc.seq.nj <- nj(tmp)
tmp <- which(anc.seq.nj$tip.label=="HXB2")
anc.seq.nj <- reroot(anc.seq.nj, tmp, anc.seq.nj$edge.length[which(anc.seq.nj$edge[,2]==tmp)])
anc.seq.nj <- ladderize(anc.seq.nj)
file <- paste( outdir, '/', substr(outfile.fg,1,nchar(outfile.fg)-2),'_checkdraw', check.draw,'_NJ.pdf', sep='' )
pdf(file=file, w=10, h=80)
plot(anc.seq.nj, show.tip=TRUE, cex=0.5)
dev.off()
# get root to tip divergence
anc.seq.nj <- drop.tip(anc.seq.nj,'HXB2')
tmp <- node.depth.edgelength(anc.seq.nj)
anc.seq.nj.info <- data.table(LABEL=anc.seq.nj$tip.label, ROOT2TIP=tmp[seq_len(Ntip(anc.seq.nj))] )
#tmp<- distRoot(anc.seq.nj)
#set(anc.seq.nj.info, NULL, 'distRoot', tmp)
set(anc.seq.nj.info, NULL, 'CALENDAR_TIME', anc.seq.nj.info[, as.numeric(sapply(strsplit(LABEL, tree.id.labelsep, fixed=TRUE),'[[',tree.id.label.idx.ctime))] )
tmp <- lm(ROOT2TIP~CALENDAR_TIME, data=anc.seq.nj.info)
set( anc.seq.nj.info, NULL, 'ROOT2TIP_LM', predict(tmp, type='response') )
tmp2 <- c( R2=round(summary(tmp)$r.squared,d=3), SLOPE= as.numeric(round(coef(tmp)['CALENDAR_TIME'],d=4)), TMRCA=as.numeric(round( -coef(tmp)['(Intercept)']/coef(tmp)['CALENDAR_TIME'], d=1 )) )
ggplot(anc.seq.nj.info, aes(x=CALENDAR_TIME, y=ROOT2TIP)) + geom_point(alpha=0.5) + geom_line(aes(y=ROOT2TIP_LM)) +
#scale_x_continuous(breaks=seq(1980,2020,2)) +
labs(x='Sequence sampling date', y='root-to-tip divergence') +
annotate("text", x=anc.seq.nj.info[, min(CALENDAR_TIME)], y=anc.seq.nj.info[, 0.9*max(ROOT2TIP)], label=paste("R2=", tmp2['R2'],'\nSlope=',tmp2['SLOPE'],'\nTMRCA=',tmp2['TMRCA'], sep=''), hjust = 0, size = 4) +
theme(legend.position=c(0,1), legend.justification=c(0,1))
file <- paste( outdir, '/', substr(outfile.fg,1,nchar(outfile.fg)-2),'_checkdraw', check.draw, '_Root2Tip.pdf', sep='' )
ggsave(file=file, w=10, h=6)
# save to file including outgroup
file <- paste( outdir, '/', substr(outfile.fg,1,nchar(outfile.fg)-2),'_checkdraw', check.draw,'_',insignat,'.R', sep='' )
cat(paste('\nsave to file', file))
save(anc.seq.draw, file=file)
}
}
##--------------------------------------------------------------------------------------------------------
## check ancestral sequences from BEAST XML, run ExaML
##--------------------------------------------------------------------------------------------------------
project.PANGEA.RootSeqSim.SIMU.SSAfg.checkancestralseq.runExaML<- function()
{
#DATA <<- "/work/or105/Gates_2014"
#DATA <<- '/Users/Oliver/duke/2014_Gates'
dir.name <- DATA
indir <- paste(dir.name,'methods_comparison_rootseqsim/140813',sep='/')
# ExaML bootstrap args
bs.from <- 0
bs.to <- 1
bs.n <- 100
# search for 'checkdraw' files
infiles <- list.files(indir)
infiles <- infiles[ sapply(infiles, function(x) grepl('.*checkdraw[0-9]+.*R$',x) ) ]
if(!length(infiles)) stop('cannot find files matching criteria')
outdir <- indir
for(i in seq_along(infiles))
{
infile <- infiles[i]
infile <- substr(infile, 1, nchar(infile)-2)
insignat <- regmatches(infile, regexpr('checkdraw[0-9]+_.*', infile))
insignat <- regmatches(insignat,regexpr('_.*',insignat))
insignat <- substr(insignat,2,nchar(insignat))
infile <- regmatches(infile, regexpr('.*checkdraw[0-9]+', infile))
cmd <- hivc.cmd.examl.bootstrap(indir, infile, insignat, insignat, bs.from=bs.from, bs.to=bs.to,bs.n=bs.n,outdir=outdir, resume=1, verbose=1)
dummy <- lapply(cmd, function(x)
{
x <- hivc.cmd.hpcwrapper(x, hpc.walltime=24, hpc.q= NA, hpc.mem="450mb", hpc.nproc=1)
#x <- hivc.cmd.hpcwrapper(x, hpc.walltime=24, hpc.q="pqeph", hpc.mem="3850mb", hpc.nproc=8)
signat <- paste(strsplit(date(),split=' ')[[1]],collapse='_',sep='')
outfile <- paste("exa",signat,sep='.')
#cat(x)
hivc.cmd.hpccaller(outdir, outfile, x)
Sys.sleep(1)
})
stop()
}
}
##--------------------------------------------------------------------------------------------------------
## check ancestral sequences from BEAST XML, run ExaML
##--------------------------------------------------------------------------------------------------------
project.PANGEA.RootSeqSim.SIMU.SSAfg.checkancestralseq.evalExaML<- function()
{
require(ape)
require(phytools)
require(ggplot2)
#DATA <<- "/work/or105/Gates_2014"
#DATA <<- '/Users/Oliver/duke/2014_Gates'
dir.name <- DATA
indir <- paste(dir.name,'methods_comparison_rootseqsim/140813',sep='/')
label.sep <- '|'
label.idx.ctime <- 4
clu.thresh.bs <- 0.9
clu.thresh.brl <- 0.045 #0.035
bs.n <- 100
# search for 'checkdraw examl' files
infiles <- list.files(indir)
infiles <- infiles[ grepl('.*checkdraw[0-9]+_examl.*newick$',infiles) ]
if(!length(infiles)) stop('cannot find files matching criteria')
# unexpected: R2 differs largely from NJ tree to ExaML tree, especially for i=3
for(i in seq_along(infiles))
{
#i <- 3
infile <- infiles[i]
file <- paste(indir, infile, sep='/')
ph <- read.tree(file)
tmp <- which(ph$tip.label=="HXB2")
ph <- reroot(ph, tmp, ph$edge.length[which(ph$edge[,2]==tmp)])
ph <- ladderize(ph)
#
file <- paste( indir, '/', substr(infile, 1, nchar(infile)-7), '_Tree.pdf', sep='' )
pdf(file=file, h=150, w=10)
plot(ph, cex=0.7)
dev.off()
#
# check root to tip divergence
#
ph <- drop.tip(ph,'HXB2')
tmp <- node.depth.edgelength(ph)
ph.info <- data.table(LABEL=ph$tip.label, ROOT2TIP=tmp[seq_len(Ntip(ph))] )
set(ph.info, NULL, 'CALENDAR_TIME', ph.info[, as.numeric(sapply(strsplit(LABEL, label.sep, fixed=TRUE),'[[',label.idx.ctime))] )
tmp <- lm(ROOT2TIP~CALENDAR_TIME, data=ph.info)
set( ph.info, NULL, 'ROOT2TIP_LM', predict(tmp, type='response') )
tmp2 <- c( R2=round(summary(tmp)$r.squared,d=3), SLOPE= as.numeric(round(coef(tmp)['CALENDAR_TIME'],d=4)), TMRCA=as.numeric(round( -coef(tmp)['(Intercept)']/coef(tmp)['CALENDAR_TIME'], d=1 )) )
ggplot(ph.info, aes(x=CALENDAR_TIME, y=ROOT2TIP)) + geom_point(alpha=0.5) + geom_line(aes(y=ROOT2TIP_LM)) +
#scale_x_continuous(breaks=seq(1980,2020,2)) +
labs(x='Sequence sampling date', y='root-to-tip divergence') +
annotate("text", x=ph.info[, min(CALENDAR_TIME)], y=ph.info[, 0.9*max(ROOT2TIP)], label=paste("R2=", tmp2['R2'],'\nSlope=',tmp2['SLOPE'],'\nTMRCA=',tmp2['TMRCA'], sep=''), hjust = 0, size = 4) +
theme(legend.position=c(0,1), legend.justification=c(0,1))
file <- paste( indir, '/', substr(infile, 1, nchar(infile)-7), '_Root2Tip.pdf', sep='' )
ggsave(file=file, w=10, h=6)
if(0)
{
# check brl divergence
#brl.tips <- distTips(ph , method='patristic')
#
# check clustering among root seqs - this should be minimal at cut-off 0.045
#
dist.brl <- hivc.clu.brdist.stats(ph, eval.dist.btw="leaf", stat.fun=hivc.clu.min.transmission.cascade )
quantile(dist.brl,p=c(0.01,0.05,0.1,0.2,0.3,0.4,0.5))
hist(dist.brl, breaks=30)
ph.node.bs <- as.numeric( ph$node.label )
ph.node.bs[is.na(ph.node.bs)] <- 0
ph.node.bs <- ph.node.bs/bs.n
ph$node.label <- ph.node.bs
ph.clu <- hivc.clu.clusterbythresh(ph, thresh.nodesupport=clu.thresh.bs, thresh.brl=clu.thresh.brl, dist.brl=dist.brl, nodesupport=ph.node.bs, retval="all")
# plot clustering tree
file <- paste( indir, '/', substr(infile, 1, nchar(infile)-7), '_Clutree.pdf', sep='' )
pdf(file=file, h=150, w=10)
tmp <- hivc.clu.plot(ph, ph.clu[["clu.mem"]], show.tip.label=TRUE, cex.edge.incluster=1.5)
dev.off()
}
}
# consider all bootstrap trees
indirs <- c( '/Users/Oliver/duke/2014_Gates/methods_comparison_rootseqsim/140813/PANGEA_SSApolBwhRc-_140811_n390_AncSeq_checkdraw1_examlout_Mon_Aug_17_17:05:23_2014',
'/Users/Oliver/duke/2014_Gates/methods_comparison_rootseqsim/140813/PANGEA_SSApolBwhRc-_140811_n390_AncSeq_checkdraw2_examlout_Mon_Aug_17_17:05:23_2014',
'/Users/Oliver/duke/2014_Gates/methods_comparison_rootseqsim/140813/PANGEA_SSApolBwhRc-_140811_n390_AncSeq_checkdraw3_examlout_Mon_Aug_17_17:05:23_2014' )
df.r2t <- lapply(seq_along(indirs), function(i)
{
indir <- indirs[i]
infiles <- list.files(indir)
infiles <- infiles[ grepl('^ExaML_result.*finaltree*',infiles) ]
if(!length(infiles)) stop('cannot find files matching criteria')
df.r2t <- lapply(seq_along(infiles), function(j)
{
infile <- infiles[j]
cat(paste('\nprocess file', infile))
file <- paste(indir, infile, sep='/')
ph <- read.tree(file)
tmp <- which(ph$tip.label=="HXB2")
ph <- reroot(ph, tmp, ph$edge.length[which(ph$edge[,2]==tmp)])
ph <- ladderize(ph)
ph <- drop.tip(ph,'HXB2')
tmp <- node.depth.edgelength(ph)
ph.info <- data.table(LABEL=ph$tip.label, ROOT2TIP=tmp[seq_len(Ntip(ph))] )
set(ph.info, NULL, 'CALENDAR_TIME', ph.info[, as.numeric(sapply(strsplit(LABEL, label.sep, fixed=TRUE),'[[',label.idx.ctime))] )
set(ph.info, NULL, 'BS', as.numeric(substr(infile, nchar(infile)-2, nchar(infile))) )
ph.info
})
df.r2t <- do.call('rbind',df.r2t)
set(df.r2t, NULL, 'FILE', i)
df.r2t
})
df.r2t <- do.call('rbind',df.r2t)
# the EXAML root to tip divergence is highly variable!
#ggplot( subset( df.r2t, LABEL=='PANGEA_SSAfgBwhRc-_140811_n390_pool1|STATE_20400000|250|1987.78' ), aes(x=ROOT2TIP)) + geom_histogram()
# get R2 across all bootstrap ExaML tree
df.r2t.su<- df.r2t[, {
tmp <- lm(ROOT2TIP ~ CALENDAR_TIME)
list( R2=round(summary(tmp)$r.squared,d=3) )
},by=c('BS','FILE')]
set(df.r2t.su, NULL, 'FILE', df.r2t.su[, factor(FILE)])
ggplot( df.r2t.su, aes(x=R2, fill=FILE)) + geom_histogram(binwidth=0.05) + facet_grid(.~FILE) +
labs(fill='draw of 1000\nancestral\nsequences', x='R2 of lm(ROOT2TIP ~ CALENDAR_TIME)')
indir <- paste(dir.name,'methods_comparison_rootseqsim/140813',sep='/')
file <- paste(indir, '/','PANGEA_SSApolBwhRc-_140811_n390_AncSeq_checkdraw1-3_examlout.pdf', sep='')
ggsave(file=file, w=10, h=5)
}
##--------------------------------------------------------------------------------------------------------
## get anecestral sequences from BEAST XML - developper version
##--------------------------------------------------------------------------------------------------------
project.PANGEA.RootSeqSim.BEAST.devel.getancestralseq.from.output<- function()
{
#dir.name <- "/work/or105/Gates_2014"
dir.name <- '/Users/Oliver/duke/2014_Gates'
if(1) # devel
{
indir <- paste(dir.name,'methods_comparison_rootseqsim/140730',sep='/')
outdir <- indir
infile.xml <- 'working.xml'
infile.beastparsed <- 'working.R'
outfile <- 'working_ancseq.R'
# load BEAST PARSER output
file <- paste(indir, '/',infile.beastparsed, sep='')
load(file) # expect tree, node.stat
# get original sequences
file <- paste(indir, '/',infile.xml, sep='')
bxml <- xmlTreeParse(file, useInternalNodes=TRUE, addFinalizer = TRUE)
bseq <- hivc.beast.get.sequences(bxml, verbose=1)
bseq <- merge(bseq, data.table(ALIGNMENT_ID=paste('alignment',1:3,sep=''), GENE=c('env','gag','pol')), by='ALIGNMENT_ID')
# compute gag pol env
tmp <- PANGEA.RootSeqSim.get.ancestral.seq(tree, node.stat, bseq, tree.id.sep='_', tree.id.idx.mcmcit=2, tree.id.burnin=1, label.sep='|', label.idx.ctime=2)
ancseq.gag <- tmp$GAG
ancseq.env <- tmp$ENV
ancseq.pol <- tmp$POL
# save as R
file <- paste(outdir, outfile, sep='/')
save(ancseq.gag, ancseq.env, ancseq.pol, file=file)
# save as FASTA
file <- paste(outdir, paste(substr(outfile,1,nchar(outfile)-1),'fasta',sep=''), sep='/')
write.dna(cbind(ancseq.gag, ancseq.env, ancseq.pol), file, format = "fasta")
#
# sample ancestral sequences between 1980-2000 and reconstruct tree with RAxML
#
ancseq <- cbind(ancseq.gag, ancseq.env, ancseq.pol)
label.sep <- '|'
label.idx.tree.id <- 1
label.idx.node.id <- 2
label.idx.ctime <- 3
ancseq.label <- data.table( TREE_ID= sapply(strsplit(rownames(ancseq), label.sep, fixed=1),'[[',label.idx.tree.id),
NODE_ID= as.numeric(sapply(strsplit(rownames(ancseq), label.sep, fixed=1),'[[',label.idx.node.id)),
CALENDAR_TIME= as.numeric(sapply(strsplit(rownames(ancseq), label.sep, fixed=1),'[[',label.idx.ctime)))
hist( ancseq.label[, CALENDAR_TIME], breaks=100 )
}
if(0)
{
tmp <- subset(node.stat, select=c(TREE_ID, NODE_ID, CALENDAR_TIME))
setkey(tmp, TREE_ID, NODE_ID)
hist(tmp[, CALENDAR_TIME], breaks=seq(1930,2011,1))
# reconstruct gene sequences and store in ape format
# get calendar time for gene sequence
set(node.stat, NULL, 'VALUE', node.stat[, gsub('\"','',VALUE)])
# check seq lengths
tmp <- node.stat[, list(NCHAR=nchar(VALUE)), by=c('STAT','NODE_ID','TREE_ID')]
stopifnot( tmp[, list(CHECK=all(NCHAR[1]==NCHAR)), by='STAT'][, all(CHECK)] )
tmp[, list(CHECK=unique(NCHAR)), by='STAT']
ENV.CP1<- "AGA"
ENV.CP2<- "XYZ"
ENV.CP3<- "KLM"
tmp <- do.call('rbind',sapply(list(ENV.CP1,ENV.CP2,ENV.CP3), strsplit, ''))
tmp <- paste(as.vector(tmp), collapse='')
subset(node.stat, TREE_ID=='STATE_0' & NODE_ID==which(btree[[1]]$tip.label=='C.BW.AF443074|1996'))
}
if(0) #devel
{
dir.name <- '/Users/Oliver/duke/2014_Gates'
indir <- paste(dir.name,'methods_comparison_rootseqsim/140801',sep='/')
infile <- 'ALLv06.n97.rlx.gmrf'
insignat <- 'Sun_Jul_27_09-00-00_2014'
file <- paste(indir, '/', infile, '_', insignat, '.timetrees', sep='')
indir <- paste(dir.name,'methods_comparison_rootseqsim/140730',sep='/')
infile.timetrees <- 'working.timetrees'
outdir <- indir
file <- paste(indir, '/', infile.timetrees, sep='')
tmp <- hivc.beast2out.read.nexus.and.stats(file, tree.id=NA, method.node.stat='any.node')
tree <- tmp$tree
node.stat <- tmp$node.stat
file <- paste(indir,'/',paste(substr(infile.timetrees,1,nchar(infile.timetrees)-9),'R',sep=''),sep='')
save(tree, node.stat, file=file)
}
}
##--------------------------------------------------------------------------------------------------------
## prepare LANL download
## downloaded 556 full genome sequences from SSA on 06-08-2014
##--------------------------------------------------------------------------------------------------------
project.PANGEA.RootSeqSim.DATA.SSAfg.process.LosAlamos<- function()
{
#DATA <<- '/Users/Oliver/duke/2014_Gates'
indir <- paste(DATA,'methods_comparison_rootseqsim/140806',sep='/')
infile.fasta <- 'PANGEA_SSAfg_140806_n557.fasta'
infile.fasta.gag<- 'PANGEA_SSAfg_gag_140806_n557.fasta'
infile.fasta.pol<- 'PANGEA_SSAfg_pol_140806_n557.fasta'
infile.fasta.env<- 'PANGEA_SSAfg_env_140806_n557.fasta'
outfile.outgroup<- 'PANGEA_SSAfg_140806_HXB2outgroup.R'
file <- paste(indir, '/', infile.fasta,sep='')
seq <- read.dna(file, format='fasta')
label.sep <- '|'
label.idx.country.id <- 2
label.idx.label <- 3
label.idx.ctime <- 4
tmp <- strsplit(names(seq), label.sep, fixed=1)
seq.label <- data.table( LABEL= names(seq),
COUNTRY_ID= sapply(tmp,'[[',label.idx.country.id),
NAME= sapply(strsplit(names(seq), label.sep, fixed=1),'[[',label.idx.label),
CALENDAR_TIME= sapply(strsplit(names(seq), label.sep, fixed=1),'[[',label.idx.ctime))
# remove sequences without a sampling date
seq.label <- subset( seq.label, !is.na(as.numeric(CALENDAR_TIME)) )
set(seq.label, NULL, 'CALENDAR_TIME', seq.label[, as.numeric(CALENDAR_TIME)])
# histogram of sample dates and sample location
ggplot(seq.label, aes(x=CALENDAR_TIME)) + geom_histogram(binwidth=1)
ggplot(seq.label, aes(x=COUNTRY_ID)) + geom_histogram()
#
# some seqs are crap and have duplicate runs, need to edit manually. ISOLATING GAG POL ENV
#
data(refseq_hiv1_hxb2)
seq.gag.c <- as.character(read.dna(file='~/git/hivclust/pkg/data/LosAlamos_HIV1B_CONSENSUS_2004_gag_DNA.fasta', format='fasta'))
seq.pol.c <- as.character(read.dna(file='~/git/hivclust/pkg/data/LosAlamos_HIV1B_CONSENSUS_2004_pol_DNA.fasta', format='fasta'))
seq.env.c <- as.character(read.dna(file='~/git/hivclust/pkg/data/LosAlamos_HIV1B_CONSENSUS_2004_env_DNA.fasta', format='fasta'))
# located gag start at 1009 manually, rm everything before
seq <- as.character(seq)
for(i in seq_along(seq)) seq[[i]][1:1008]<- "-"
tmp <- c(1041:1043, 1218:1220, 1321:1322, 1326, 1341:1352, 1366:1389, 1395:1397, 1402:1407, 1411:1413, 1417:1440, 1443:1445, 1447:1449, 1896:1898, 2271:2291, 2455:2466, 2507:2509,
2511:2513, 2521:2523, 2551:2577, 2596, 2640, 2644:2655, 2668:2670 )
for(i in seq_along(seq)) seq[[i]][tmp] <- '-'
# add '-' to get all seqs of same length
tmp <- max(sapply(seq, length))
seq <- lapply(seq, function(x) c(x, rep('-',tmp-length(x))) )
# get into matrix DNAbin
seq <- as.DNAbin(do.call('rbind', seq))
# rm '-' columns so far
seq <- seq.rmallchar(seq, rm.char='-', verbose=1)
file <- paste(indir, '/fixup1_', infile.fasta,sep='')
write.dna(seq, file=file, format='fasta')
# located pol/prot start at 1657 manually, cut and rm gaps and re-align gag
seq <- as.character(seq)
seq.gag <- seq[, 1:1656]
tmp <- rownames(seq.gag)
seq.gag <- lapply(seq_len(nrow(seq.gag)), function(i){ seq.gag[i, seq.gag[i,]!="-" ] })
seq.gag[[length(seq.gag)+1]] <- hxb2[, as.character(HXB2.K03455)]
seq.gag[[length(seq.gag)+1]] <- seq.gag.c['CONSENSUS_C',]
names(seq.gag) <- c(tmp,'HXB2','CONSENSUS_C')
seq.gag <- as.DNAbin(seq.gag)
file <- paste(indir, '/', infile.fasta.gag,sep='')
write.dna(seq.gag, file=file, format='fasta')
# align GAG
tmp <- hivc.cmd.clustalo(indir, infile.fasta.gag, signat='', outdir=indir, prog= PR.CLUSTALO, hmm=PR.CLUSTALO.HMM, nproc=1, verbose=1)
# continue fixup for POL
file <- paste(indir, '/fixup1_', infile.fasta,sep='')
seq <- read.dna(file, format='fasta')
seq <- as.character(seq)
seq[, 1:1656] <- "-"
# located end of POL at 4597 - found length issues, align with own HXB2
seq.pol <- seq[, 1:4700]
tmp <- rownames(seq.pol)
seq.pol <- lapply(seq_len(nrow(seq.pol)), function(i){ seq.pol[i, seq.pol[i,]!="-" ] })
seq.pol[[length(seq.pol)+1]] <- hxb2[, as.character(HXB2.K03455)]
seq.pol[[length(seq.pol)+1]] <- seq.pol.c['CONSENSUS_C',]
names(seq.pol) <- c(tmp,'HXB2','CONSENSUS_C')
seq.pol <- as.DNAbin( seq.pol )
file <- paste(indir, '/', infile.fasta.pol,sep='')
write.dna(seq.pol, file=file, format='fasta')
# align POL
# /Users/Oliver/git/hivclust/pkg/inst/mafft-mac/mafft.bat --op 1.8 --ep 0.4 --maxiterate 15 --thread 4 /Users/Oliver/duke/2014_Gates/methods_comparison_rootseqsim/140806/PANGEA_SSAfg_pol_140806_n557.fasta > /Users/Oliver/duke/2014_Gates/methods_comparison_rootseqsim/140806/PANGEA_SSAfg_pol_140806_n557.fasta.mafft
#tmp <- hivc.cmd.clustalo(indir, infile.fasta.pol, signat='', outdir=indir, prog= PR.CLUSTALO, hmm=PR.CLUSTALO.HMM, nproc=1, verbose=1)
#
# located start of ENV around 5918 which( seq[nrow(seq), 4597:ncol(seq)]=='-' ) which( seq[nrow(seq), 5918:ncol(seq)]=='-' ) -> 660 -> 9058
# located end of ENV around 9176
file <- paste(indir, '/fixup1_', infile.fasta,sep='')
seq <- read.dna(file, format='fasta')
seq <- as.character(seq)
seq.env <- seq[, 5850:9250]
tmp <- rownames(seq.env)
seq.env <- lapply(seq_len(nrow(seq.env)), function(i){ seq.env[i, seq.env[i,]!="-" ] })
seq.env[[length(seq.env)+1]] <- hxb2[, as.character(HXB2.K03455)]
seq.env[[length(seq.env)+1]] <- seq.env.c['CONSENSUS_C',]
names(seq.env) <- c(tmp,'HXB2','CONSENSUS_C')
seq.env <- as.DNAbin( seq.env )
file <- paste(indir, '/', infile.fasta.env,sep='')
write.dna(seq.env, file=file, format='fasta')
# align ENV
tmp <- hivc.cmd.clustalo(indir, infile.fasta.env, signat='', outdir=indir, prog= PR.CLUSTALO, hmm=PR.CLUSTALO.HMM, nproc=1, verbose=1)
#
# process GAG
#
file <- paste(indir, '/', infile.fasta.gag,'.mafft',sep='')
seq.gag <- read.dna(file, format='fasta')
seq.gag <- as.character(seq.gag)
seq.gag <- seq.gag[,790:2698]
tmp <- rownames(seq.gag)
seq.gag <- lapply(seq_len(nrow(seq.gag)), function(i){ seq.gag[i, seq.gag[i,]!="-" ] })
names(seq.gag) <- tmp
seq.gag <- as.DNAbin(seq.gag)
infile.fasta.gag<- 'PANGEA_SSAfg_gag2_140806_n557.fasta'
file <- paste(indir, '/', infile.fasta.gag,sep='')
write.dna(seq.gag, file=file, format='fasta')
#/Users/Oliver/git/hivclust/pkg/inst/mafft-mac/mafft.bat --op 3.0 --ep 1.5 --maxiterate 1000 --thread 4 /Users/Oliver/duke/2014_Gates/methods_comparison_rootseqsim/140806/PANGEA_SSAfg_gag2_140806_n557.fasta > /Users/Oliver/duke/2014_Gates/methods_comparison_rootseqsim/140806/PANGEA_SSAfg_gag2_140806_n557.fasta.mafft
# manual edits
file <- paste(indir, '/', infile.fasta.gag,'.mafft',sep='')
seq.gag <- read.dna(file, format='fasta')
seq.gag <- as.character(seq.gag)
seq.gag[, c(1361:1404, 1406:1413, 1421:1458)] <- '-'
seq.gag <- seq.rmgaps(seq.gag, rm.only.col.gaps=1, verbose=1)
infile.fasta.gag<- 'PANGEA_SSAfg_gag3_140806_n557.fasta'
file <- paste(indir, '/', infile.fasta.gag,sep='')
write.dna(seq.gag, file=file, format='fasta')
# final without HXB2 / CONSENSUS
seq.gag <- seq.rmgaps(seq.gag, rm.only.col.gaps=1, verbose=1) #len 1466 -- 3 more than expected; there is an AA insertion for the SA seqs at pos 367
outgroup.seq.gag<- seq.gag[ 'HXB2'==rownames(seq.gag), ]
seq.gag <- seq.gag[ -c( which(grepl('HXB2',rownames(seq.gag))), which(grepl('CONSENSUS',rownames(seq.gag))) ), ]
seq.gag <- seq.rmgaps(seq.gag, rm.only.col.gaps=1, verbose=2)
# based on verbose output, cut the following sites from outgroup.seq.gag
# 373:381, 1123:1125, 1401:1403, 1408:1410, 1449:1454
outgroup.seq.gag<- outgroup.seq.gag[, -c(373:381, 1123:1125, 1401:1403, 1408:1410, 1449:1454)]
infile.fasta.gag<- 'PANGEA_SSAfg_gag_140806_n556_final.fasta'
file <- paste(indir, '/', infile.fasta.gag,sep='')
write.dna(seq.gag, file=file, format='fasta')
#
# process POL
#
file <- paste(indir, '/', infile.fasta.pol,'.mafft',sep='')
seq.pol <- read.dna(file, format='fasta')
# start: 2253 end: 5096
seq.pol <- seq.pol[,2253:5096]
outgroup.seq.pol<- seq.pol[ 'HXB2'==rownames(seq.pol), ]
seq.pol <- seq.pol[ -c( which(grepl('HXB2',rownames(seq.pol))), which(grepl('CONSENSUS',rownames(seq.pol))) ), ]
infile.fasta.pol<- 'PANGEA_SSAfg_pol_140806_n556_final.fasta'
file <- paste(indir, '/', infile.fasta.pol,sep='')
write.dna(seq.pol, file=file, format='fasta')
#
# process ENV
#
# start 6216 end 9734
file <- paste(indir, '/', infile.fasta.env,'.mafft',sep='')
seq.env <- read.dna(file, format='fasta')
seq.env <- as.character(seq.env)
tmp <- names(seq.env)
seq.env <- t(sapply( seq_along(seq.env), function(i){ seq.env[[i]][ 6216:9734 ] }))
seq.env <- lapply(seq_len(nrow(seq.env)), function(i){ seq.env[i, seq.env[i,]!="-" ] })
names(seq.env) <- tmp
seq.env <- as.DNAbin(seq.env)
infile.fasta.env<- 'PANGEA_SSAfg_env2_140806_n557.fasta'
file <- paste(indir, '/', infile.fasta.env,sep='')
write.dna(seq.env, file=file, format='fasta')
# align on LosAlamos + manual edits
file <- paste(indir, '/', infile.fasta.env,'.hivalign',sep='')
seq.env <- read.dna(file, format='fasta')
seq.env <- as.character(seq.env)
seq.env[,c(1613:1645,2852:2932) ] <- '-'
seq.env <- seq.rmgaps(seq.env, rm.only.col.gaps=1, verbose=1)
infile.fasta.env<- 'PANGEA_SSAfg_env3_140806_n557.fasta'
file <- paste(indir, '/', infile.fasta.env,sep='')
write.dna(seq.env, file=file, format='fasta', colsep='', nbcol=-1)
# align on LosAlamos + manual edits
file <- paste(indir, '/', infile.fasta.env,'.hivalignnt',sep='')
seq.env <- read.dna(file, format='fasta')
seq.env <- as.character(seq.env)
tmp <- max(sapply(seq.env, length))
seq.env <- t(sapply(seq.env, function(x) c(x, rep('-',tmp-length(x))) ))
seq.env[,c(430:543) ] <- '-' #rm non-consensus jitter
seq.env[,c(1294:1344) ] <- '-' #rm non-consensus jitter
seq.env <- seq.rmgaps(seq.env, rm.only.col.gaps=1, verbose=1)
infile.fasta.env<- 'PANGEA_SSAfg_env4_140806_n557.fasta'
file <- paste(indir, '/', infile.fasta.env,sep='')
write.dna(seq.env, file=file, format='fasta', colsep='', nbcol=-1)
#seq.env[,c(1:9) ] <- '-' #C consensus start
#!! There are two extra AA that are not in consensus C TAGTAG at pos 561
#!! There are two extra AA that are not in consensus C at pos 2310
#fixed some final issues manually
seq.env <- read.dna(file, format='fasta')
outgroup.seq.env<- seq.env[ 'HXB2'==rownames(seq.env), ]
seq.env <- seq.env[ -c( which(grepl('HXB2',rownames(seq.env))), which(grepl('CONSENSUS',rownames(seq.env))) ), ]
seq.env <- seq.rmgaps(seq.env, rm.only.col.gaps=1, verbose=2)
# based on verbose output, cut the following sites from outgroup.seq.env
# 55:66, 919:924, 1057:1059
outgroup.seq.env<- outgroup.seq.env[, -c(55:66, 919:924, 1057:1059)]
infile.fasta.env<- 'PANGEA_SSAfg_env_140806_n556_final.fasta'
file <- paste(indir, '/', infile.fasta.env,sep='')
write.dna(seq.env, file=file, format='fasta', colsep='', nbcol=-1)
# write outgroups
file <- paste(indir, '/', outfile.outgroup, sep='')
save(file=file, outgroup.seq.gag, outgroup.seq.pol, outgroup.seq.env)
}
##--------------------------------------------------------------------------------------------------------
## get sequences from BEAST XML, create concatenated file
##--------------------------------------------------------------------------------------------------------
project.PANGEA.RootSeqSim.DATA.checkDrugResistanceMask<- function()
{
require(XML)
require(ape)
#DATA <<- "/work/or105/Gates_2014"
#DATA <<- '/Users/Oliver/duke/2014_Gates'
indir <- paste(DATA,'methods_comparison_rootseqsim/140727',sep='/')
outdir <- paste(DATA,'methods_comparison_rootseqsim/140728',sep='/')
infile <- 'ALLv02.n100.rlx.gmrf'
insignat <- 'Sun_Jul_27_09-00-00_2014'
file <- '/Users/Oliver/duke/2014_Gates/methods_comparison_rootseqsim/140727/ALLv01.n100.rlx.gmrf_Sun_Jul_27_09-00-00_2014.xml'
file <- '/Users/Oliver/duke/2014_Gates/methods_comparison_rootseqsim/140801/ALLv06.n97.rlx.gmrf_Sun_Jul_27_09-00-00_2014.xml'
bxml <- xmlTreeParse(file, useInternalNodes=TRUE, addFinalizer = TRUE)
bseq <- hivc.beast.get.sequences(bxml, verbose=1)
bseq <- merge(bseq, data.table(ALIGNMENT_ID=paste('alignment',1:3,sep=''), GENE=c('env','gag','pol')), by='ALIGNMENT_ID')
# check all seq of same length per gene
bseq <- merge(bseq, bseq[, list(SEQ_N=nchar(SEQ)), by=c('GENE','TAXON_ID')], by=c('GENE','TAXON_ID'))
stopifnot( bseq[, all(SEQ_N==SEQ_N[1]), by='GENE'][, all(V1)] )
# check 3 genes per taxon
stopifnot( bseq[, length(SEQ)==3, by='TAXON_ID'][, all(V1)] )
# check if indeed patterns are compressed
if(0)
{
bseq <- subset(bseq, GENE=='env')
tmp <- tolower(do.call('rbind',strsplit(bseq[, SEQ],'')))
bseq.CP1<- tmp[, seq.int(1, ncol(tmp), by=3)]
bseq.CP2<- tmp[, seq.int(2, ncol(tmp), by=3)]
bseq.CP3<- tmp[, seq.int(3, ncol(tmp), by=3)]
tmp <- bseq.CP3
tmp2 <- apply( tmp, 1, function(x) paste(x,sep='',collapse='')) #identical sequences?
cat(paste('\nunique sequences, n=',length(unique(tmp2))))
tmp2 <- apply( tmp, 2, function(x) paste(x,sep='',collapse='')) #identical patterns?
cat(paste('\nunique patterns, n=',length(unique(tmp2))))
tmp2 <- apply( tmp, 2, function(x) all(x==x[1])) #invariant sites?
length(which(tmp2))
}
# check for drug resistance mutations
if(0)
{
tmp <- subset(bseq, GENE=='pol')
bseq.pol.m <- do.call('rbind',strsplit(tmp[, SEQ],''))
file <- '/Users/Oliver/git/hivclust/pkg/data/IAS_primarydrugresistance_201303.rda'
alignment.start <- 2085
load(file)
IAS_primarydrugresistance_201303 <- as.data.table(IAS_primarydrugresistance_201303)
set(IAS_primarydrugresistance_201303, NULL, "Alignment.nuc.pos", IAS_primarydrugresistance_201303[,Alignment.nuc.pos]-alignment.start+1)
#pol not in HXB2 consensus coordinates - TODO would have to align against consensus
z<- seq.rm.drugresistance(bseq.pol.m, IAS_primarydrugresistance_201303, verbose=1, rtn.DNAbin=0)
}
# concatenate into single DNAbin matrix and save
tmp <- dcast.data.table(bseq, TAXON_ID ~ GENE, value.var="SEQ")
tmp[, SEQ_ALL:=paste(gag, pol, env, sep='')]
tmp2 <- tolower(do.call('rbind',strsplit(tmp[, SEQ_ALL],'')))
rownames(tmp2) <- tmp[, TAXON_ID]
seq <- as.DNAbin(tmp2)
outfile <- paste(infile,'_conc_',insignat,'.R',sep='')
save(seq, file=paste(outdir, outfile, sep='/'))
}
##--------------------------------------------------------------------------------------------------------
## check for recombinants
##--------------------------------------------------------------------------------------------------------
project.PANGEA.RootSeqSim.DATA.checkRecombinants<- function()
{
require(XML)
require(ape)
require(r3SEQ)
#DATA <<- "/work/or105/Gates_2014"
#DATA <<- '/Users/Oliver/duke/2014_Gates'
indir <- paste(DATA,'methods_comparison_rootseqsim/140728',sep='/')
infile <- 'ALLv02.n100.rlx.gmrf_conc'
outfile <- 'ALLv03.n97.rlx.gmrf'
insignat <- 'Sun_Jul_27_09-00-00_2014'
#file <- paste(indir, '/', infile, '_', insignat, '.R', sep='')
#load(file)
# run 3SEQ
infile.3seq <- paste(infile, '_', insignat, '.R', sep='')
pipeline.recom.run.3seq(indir, infile.3seq, batch.n=1, hpc.walltime=1, hpc.q=NA, hpc.mem="500mb", hpc.nproc=1)
# parse 3SEQ output
argv <<- cmd.recombination.process.3SEQ.output(indir, infile.3seq, '', resume=1, verbose=1)
argv <<- unlist(strsplit(argv,' '))
df.recomb <- prog.recom.process.3SEQ.output()
# select potential recombinants with p-value < 1e-3
df.recomb <- subset( df.recomb, adjp<1e-3 )
cat(paste('\nfound potential recombinants, n=',nrow(df.recomb)))
#
# remove potential recombinants from XML file
#
file <- '/Users/Oliver/duke/2014_Gates/methods_comparison_rootseqsim/140727/ALLv02.n100.rlx.gmrf_Sun_Jul_27_09-00-00_2014.xml'
bxml.template <- xmlTreeParse(file, useInternalNodes=TRUE, addFinalizer = TRUE)
bseq <- hivc.beast.get.sequences(bxml.template, verbose=1)
bseq <- merge(bseq, data.table(ALIGNMENT_ID=paste('alignment',1:3,sep=''), GENE=c('env','gag','pol')), by='ALIGNMENT_ID')
bseq <- subset(bseq, !TAXON_ID%in%df.recomb[, child])
#
# write XML file with new sequences
#
bxml <- newXMLDoc(addFinalizer=T)
bxml.beast <- newXMLNode("beast", doc=bxml, addFinalizer=T)
newXMLCommentNode(text=paste("Generated by HIVCLUST from template",file), parent=bxml.beast, doc=bxml, addFinalizer=T)
# add new set of ENV sequences into alignment ID 1
set( bseq, NULL, 'SEQ', bseq[, tolower(SEQ)] )
tmp <- subset(bseq, GENE=='env')
tmp2 <- tmp[, do.call('rbind',strsplit(tmp[, SEQ],''))]
rownames(tmp2) <- tmp[, TAXON_ID]
tmp2 <- as.DNAbin(tmp2)
bxml <- hivc.beast.add.seq(bxml, tmp2, df=NULL, beast.label.datepos= 2, beast.label.sep= '|', beast.date.direction= "forwards", beast.date.units= "years", beast.alignment.id="alignment1", beast.alignment.dataType= "nucleotide", verbose=1)
# add new set of GAG sequences into alignment ID 2
tmp <- subset(bseq, GENE=='gag')
tmp2 <- tmp[, do.call('rbind',strsplit(tmp[, SEQ],''))]
rownames(tmp2) <- tmp[, TAXON_ID]
tmp2 <- as.DNAbin(tmp2)
bxml <- hivc.beast.add.seq(bxml, tmp2, df=NULL, beast.label.datepos= 2, beast.label.sep= '|', beast.date.direction= "forwards", beast.date.units= "years", beast.alignment.id="alignment2", beast.alignment.dataType= "nucleotide", verbose=1)
# add new set of POL sequences into alignment ID 3
tmp <- subset(bseq, GENE=='pol')
tmp2 <- tmp[, do.call('rbind',strsplit(tmp[, SEQ],''))]
rownames(tmp2) <- tmp[, TAXON_ID]
tmp2 <- as.DNAbin(tmp2)
bxml <- hivc.beast.add.seq(bxml, tmp2, df=NULL, beast.label.datepos= 2, beast.label.sep= '|', beast.date.direction= "forwards", beast.date.units= "years", beast.alignment.id="alignment3", beast.alignment.dataType= "nucleotide", verbose=1)
# copy rest from template
bt.beast <- getNodeSet(bxml.template, "//beast")[[1]]
dummy <- sapply(seq.int( max(which( xmlSApply(bt.beast, xmlName)=="alignment" ))+1, xmlSize(bt.beast) ), function(i)
{
if( class(bt.beast[[i]])[1]=="XMLInternalCommentNode" )
dummy<- newXMLCommentNode(text=xmlValue(bt.beast[[i]]), parent=bxml.beast, doc=bxml, addFinalizer=T)
else
dummy<- addChildren( bxml.beast, xmlClone( bt.beast[[i]], addFinalizer=T, doc=bxml ) )
})
# change outfile name
bxml.onodes <- getNodeSet(bxml, "//*[@fileName]")
tmp <- sapply(bxml.onodes, function(x) xmlGetAttr(x,"fileName"))
tmp <- gsub("(time).","time",tmp,fixed=1)
tmp <- gsub("(subst).","subst",tmp,fixed=1)
tmp <- sapply(strsplit(tmp,'.',fixed=1), function(x) paste(outfile,'_',insignat, '.', tail(x,1), sep=''))
dummy <- sapply(seq_along(bxml.onodes), function(i){ xmlAttrs(bxml.onodes[[i]])["fileName"]<- tmp[i] })
# write to file
file <- paste(indir,'/',outfile,'_',insignat,".xml", sep='')
if(verbose) cat(paste("\nwrite xml file to",file))
saveXML(bxml, file=file)
if(0)
{
# get RAXML trees
triplets <- 1
#triplets <- 147:nrow(df.recomb)
dummy <- lapply(triplets, function(i)
{
i<- 1
if(verbose) cat(paste("\nprocess triplet number",i,"\n"))
argv <<- cmd.recombination.check.candidates(indir, infile.3seq, '', i, resume=resume, verbose=1, hpc.walltime=1, hpc.q=NA, hpc.mem='500mb', hpc.nproc=1)
argv <<- unlist(strsplit(argv,' '))
prog.recom.get.incongruence() #this starts ExaML for the ith triplet
})
}
}
##--------------------------------------------------------------------------------------------------------
## simulate sequence sampling and break up transmission chains by imports
##--------------------------------------------------------------------------------------------------------
project.PANGEA.SampleSeq.simulate<- function()
{
indir <- system.file(package="PANGEA.HIV.sim", "misc")
tmpdir.HPTN071 <- '/Users/Oliver/git/HPTN071sim/tmp140908'
infile.ind <- '140716_RUN001_IND.csv'
infile.trm <- '140716_RUN001_TRM.csv'
outfile.ind <- '140716_RUN001_IND.csv'
outfile.trm <- '140716_RUN001_TRM.csv'
dir.create(tmpdir.HPTN071, showWarnings=FALSE)
#get input into 'argv'. this is needed because the input parser is usually called from the command line, and 'argv' mimics the way input is provided when the parser is called from the command line
cmd <- cmd.HPTN071.input.parser(indir, infile.trm, infile.ind, tmpdir.HPTN071, infile.trm, infile.ind)
argv <<- unlist(strsplit(cmd,' '))
prog.HPTN071.input.parser.v1()
}
##--------------------------------------------------------------------------------------------------------
## pipeline to generate sequence data sets from HPTN071 output
##--------------------------------------------------------------------------------------------------------
pipeline.HPTN071<- function()
{
stop()
# example input files
indir <- system.file(package="PANGEA.HIV.sim", "misc")
indir <- ifelse(indir=='','/Users/Oliver/git/HPTN071sim/raw_trchain',indir)
infile.ind <- '140716_RUN001_IND.csv'
infile.trm <- '140716_RUN001_TRM.csv'
# re-name the following:
tmpdir <- '/Users/Oliver/git/HPTN071sim/tmp140908'
#
# pipeline start
#
## sample sequences and draw imports
cmd <- paste('mkdir -p ', tmpdir,'\n',sep='')
tmpdir.HPTN071 <- paste(tmpdir,'/HPTN071parser',sep='')
cmd <- paste(cmd, 'mkdir -p ', tmpdir.HPTN071,'\n',sep='')
cmd <- paste(cmd, cmd.HPTN071.input.parser.v2(indir, infile.trm, infile.ind, tmpdir.HPTN071, infile.trm, infile.ind), sep='\n')
## run virus tree simulator
tmpdir.VTS <- paste(tmpdir,'/VirusTreeSimulator',sep='')
cmd <- paste(cmd, 'mkdir -p ', tmpdir.VTS,'\n',sep='')
outfile <- substr(infile.ind, 1, nchar(infile.ind)-7)
prog.args <- '-demoModel Logistic -N0 100000 -growthRate 0.0001 -t50 -0.04'
cmd <- paste(cmd, cmd.VirusTreeSimulator(tmpdir.HPTN071, infile.trm, infile.ind, tmpdir.VTS, outfile, prog.args=prog.args), sep='\n')
## create seq gen input files
tmpdir.SG <- paste(tmpdir,'/SeqGen',sep='')
cmd <- paste(cmd, 'mkdir -p ', tmpdir.SG,'\n',sep='')
infile.epi <- paste( substr(infile.ind, 1, nchar(infile.ind)-7),'SAVE.R', sep='' )
infile.vts <- substr(infile.ind, 1, nchar(infile.ind)-7)
cmd <- paste(cmd, cmd.SeqGen.createInputFiles(indir, infile.epi, tmpdir.VTS, infile.vts, tmpdir.SG), sep='\n')
stop()
# currently requires output from last step so this is a separate batch file
cmd <- ''
infile.epi <- paste( substr(infile.ind, 1, nchar(infile.ind)-7),'SAVE.R', sep='' )
plot.file <- paste(indir, '/', substr(infile.epi,1,nchar(infile.epi)-6),'INFO_sg_ERPOSTERIOR_by_MODEL.pdf', sep='')
cmd <- paste(cmd, cmd.PANGEA.SeqGen(tmpdir.SG, infile.vts, plot.file=plot.file), sep='\n')
tmpdir.sim <- paste(tmpdir,'/Simu',sep='')
cmd <- paste(cmd, 'mkdir -p ', tmpdir.sim,'\n',sep='')
cmd <- paste(cmd, cmd.SeqGen.readOutputFiles(indir, infile.epi, tmpdir.SG, tmpdir.sim), sep='')
cat(cmd)
outfile <- paste("pngea",paste(strsplit(date(),split=' ')[[1]],collapse='_',sep=''),sep='.')
cmd.hpccaller(indir, outfile, cmd)
quit("no")
}
##--------------------------------------------------------------------------------------------------------
## devel code to process SeqGen output
##--------------------------------------------------------------------------------------------------------
project.PANGEA.SeqGen.readOutput<- function(indir.sg, infile.prefix)
{
label.idx.codonpos <- 1
label.idx.gene <- 2
label.idx.clu <- 3
treelabel.idx.idpop <- 1
treelabel.idx.sep <- '|'
indir.epi <- '/Users/Oliver/git/HPTN071sim/data_HPTN071epimodel_output'
infile.epi <- '140716_RUN001_SAVE.R'
indir.sg <- '/Users/Oliver/git/HPTN071sim/tmp/SeqGen'
outdir <- '/Users/Oliver/git/HPTN071sim/data_HPTN071epimodel_output'
# load simulated epi data
file <- paste(indir.epi, '/', infile.epi, sep='')
load(file) #expect "df.epi" "df.trms" "df.inds" "df.sample"
# collect simulated sequences
infiles <- list.files(indir.sg)
infiles <- infiles[ grepl('*phy$', infiles) ]
if(!length(infiles)) stop('cannot find files matching criteria')
infile.df <- data.table(FILE=infiles)
tmp <- infile.df[, strsplit(FILE, '_') ]
infile.df[, CODON_POS:= sapply(tmp, function(x) rev(x)[label.idx.codonpos])]
infile.df[, GENE:= sapply(tmp, function(x) rev(x)[label.idx.gene])]
infile.df[, IDCLU:= sapply(tmp, function(x) rev(x)[label.idx.clu])]
set(infile.df, NULL, 'CODON_POS', infile.df[, substr(CODON_POS,1,3)])
#
# read simulated sequences
#
df.seq <- infile.df[, {
cat(paste('\nread seq in file',FILE))
file <- paste(indir.sg,'/',FILE,sep='')
tmp <- as.character(read.dna(file, format='sequential'))
list( SEQ=apply(tmp,1,function(x) paste(x, collapse='')), LABEL=rownames(tmp) )
}, by='FILE']
df.seq <- merge(df.seq, infile.df, by='FILE')
#
# reconstruct genes from codon positions
#
df.seq[, STAT:=paste(GENE,CODON_POS,sep='.')]
df.seq <- dcast.data.table(df.seq, IDCLU + LABEL ~ STAT, value.var="SEQ")
# check that seq of correct size
stopifnot( df.seq[, all( nchar(GAG.CP1)==nchar(GAG.CP2) & nchar(GAG.CP1)==nchar(GAG.CP3) )] )
stopifnot( df.seq[, all( nchar(POL.CP1)==nchar(POL.CP2) & nchar(POL.CP1)==nchar(POL.CP3) )] )
stopifnot( df.seq[, all( nchar(ENV.CP1)==nchar(ENV.CP2) & nchar(ENV.CP1)==nchar(ENV.CP3) )] )
#
df.seq <- df.seq[, {
tmp <- do.call('rbind',sapply(list(ENV.CP1,ENV.CP2,ENV.CP3), strsplit, ''))
env <- paste(as.vector(tmp), collapse='')
tmp <- do.call('rbind',sapply(list(GAG.CP1,GAG.CP2,GAG.CP3), strsplit, ''))
gag <- paste(as.vector(tmp), collapse='')
tmp <- do.call('rbind',sapply(list(POL.CP1,POL.CP2,POL.CP3), strsplit, ''))
pol <- paste(as.vector(tmp), collapse='')
list(GAG=gag, POL=pol, ENV=env, IDCLU=IDCLU)
}, by=c('LABEL')]
# check that we have indeed seq for all sampled individuals
df.seq <- subset( df.seq, !grepl('NOEXIST',LABEL) )
tmp <- df.seq[, sapply( strsplit(LABEL, treelabel.idx.sep, fixed=TRUE), '[[', treelabel.idx.idpop )]
df.seq[, IDPOP:=as.integer(substr(tmp,7,nchar(tmp)))]
stopifnot( setequal( subset( df.inds, !is.na(TIME_SEQ) )[, IDPOP], df.seq[,IDPOP]) )
# merge simulated data
simu.df <- merge(subset(df.seq, select=c(IDPOP, GAG, POL, ENV)), subset( df.inds, !is.na(TIME_SEQ) ), by='IDPOP', all.x=TRUE)
#
# save simulated data -- internal
#
outfile.prefix <- substr(infile.epi,1,nchar(infile.epi)-6)
file <- paste(outdir, '/', outfile.prefix, 'SIMULATED_INTERNAL.R', sep='')
cat(paste('\nwrite to file', file))
save(df.epi, df.trms, df.inds, df.sample, df.seq, file=file)
#
# save simulated data -- to be shared
#
tmp <- subset( df.inds, !is.na(TIME_SEQ), select=c(IDPOP, GENDER, CIRCM, DOB, DOD, TIME_SEQ ) )
file <- paste(outdir, '/', outfile.prefix, 'SIMULATED_metadata.csv', sep='')
cat(paste('\nwrite to file', file))
write.csv(tmp, file)
tmp <- tolower(do.call('rbind',strsplit(df.seq[, GAG],'')))
rownames(tmp) <- df.seq[, LABEL]
tmp <- as.DNAbin(tmp)
file <- paste(outdir, '/', outfile.prefix, 'SIMULATED_gag.fa', sep='')
write.dna(tmp, file, format = "fasta")
tmp <- tolower(do.call('rbind',strsplit(df.seq[, POL],'')))
rownames(tmp) <- df.seq[, LABEL]
tmp <- as.DNAbin(tmp)
file <- paste(outdir, '/', outfile.prefix, 'SIMULATED_pol.fa', sep='')
write.dna(tmp, file, format = "fasta")
tmp <- tolower(do.call('rbind',strsplit(df.seq[, ENV],'')))
rownames(tmp) <- df.seq[, LABEL]
tmp <- as.DNAbin(tmp)
file <- paste(outdir, '/', outfile.prefix, 'SIMULATED_env.fa', sep='')
write.dna(tmp, file, format = "fasta")
# zip simulated files
tmp <- c( paste(outdir, '/', outfile.prefix, 'SIMULATED_metadata.csv', sep=''), paste(outdir, '/', outfile.prefix, 'SIMULATED_env.fa', sep=''), paste(outdir, '/', outfile.prefix, 'SIMULATED_pol.fa', sep=''), paste(outdir, '/', outfile.prefix, 'SIMULATED_gag.fa', sep='') )
zip( paste(outdir, '/', outfile.prefix, 'SIMULATED.zip', sep=''), tmp, flags = "-Fr9XTj")
file.remove(tmp)
}
##--------------------------------------------------------------------------------------------------------
## devel code to call VirusTreeSimulator
##--------------------------------------------------------------------------------------------------------
project.PANGEA.VirusTreeSimulator.v1<- function()
{
require(data.table)
indir <- '/Users/Oliver/git/HPTN071sim/tmp/HPTN071parser'
outdir <- '/Users/Oliver/duke/2014_Gates/methods_comparison_trchphylosim/140819'
infile.ind <- '140716_RUN001_IND.csv'
infile.trm <- '140716_RUN001_TRM.csv'
outfile <- '140716_RUN001_VirusTreeSim_'
# N0 effective pop size at time 0
# growthRate the effective population size growth rate
# t50 the time point, relative to the time of infection in backwards time, at which the population is equal to half its final asymptotic value
cmd <- cmd.VirusTreeSimulator(indir, infile.trm, infile.ind, outdir, outfile, prog.args='-demoModel Logistic -N0 0.1 -growthRate 1.5 -t50 -4')
cmd <- cmd.VirusTreeSimulator(indir, infile.trm, infile.ind, outdir, outfile, prog.args='-demoModel Logistic -N0 100000 -growthRate 0.0001 -t50 -0.04')
outdir <- '/Users/Oliver/duke/2014_Gates/methods_comparison_trchphylosim/140827'
cmd <- cmd.VirusTreeSimulator(indir, infile.trm, infile.ind, outdir, outfile, prog.args='-demoModel Constant -N0 15000')
cat(cmd)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.