.onAttach <- function(...)
{
packageStartupMessage("PANGEA.HIV.sim\nHIV Phylogenetic Simulation Model for the PANGEA-HIV methods comparison exercise\nhttps://github.com/olli0601/PANGEA.HIV.sim")
}
.onLoad <- function(...)
{
suppressMessages(library(gamlss))
suppressMessages(library(ape))
suppressMessages(library(phytools))
suppressMessages(library(reshape2))
suppressMessages(library(data.table))
suppressMessages(library(RColorBrewer))
suppressMessages(library(grid))
suppressMessages(library(ggplot2))
suppressMessages(library(igraph))
suppressMessages(library(distr))
}
##--------------------------------------------------------------------------------------------------------
# olli copied from hivclust
##--------------------------------------------------------------------------------------------------------
seq.unique<- function(seq.DNAbin.matrix)
{
x<- as.character(seq.DNAbin.matrix)
x<- apply(x, 1, function(z) paste(z,collapse=''))
seq.DNAbin.matrix[!duplicated(x),]
}
##--------------------------------------------------------------------------------------------------------
# olli copied from hivclust
##--------------------------------------------------------------------------------------------------------
seq.rmgaps<- function(seq.DNAbin.matrix, rm.only.col.gaps=1, rm.char='-', verbose=0)
{
if(class(seq.DNAbin.matrix)=='DNAbin')
seq.DNAbin.matrix <- as.character(seq.DNAbin.matrix)
if(!rm.only.col.gaps)
{
if(is.matrix(seq.DNAbin.matrix))
{
tmp <- lapply(seq_len(nrow(seq.DNAbin.matrix)), function(i){ seq.DNAbin.matrix[i, !seq.DNAbin.matrix[i,]%in%rm.char] })
names(tmp) <- rownames(seq.DNAbin.matrix)
}
else
{
tmp <- lapply(seq_along(seq.DNAbin.matrix), function(i){ seq.DNAbin.matrix[[i]][ !seq.DNAbin.matrix[[i]]%in%rm.char] })
names(tmp) <- names(seq.DNAbin.matrix)
}
seq.DNAbin.matrix <- tmp
}
else
{
gap <- apply(seq.DNAbin.matrix,2,function(x) all(x%in%rm.char))
if(verbose) cat(paste("\nremove gaps, n=",length(which(gap))))
if(verbose>1) cat(paste("\nremove gaps, at pos=",which(gap)))
seq.DNAbin.matrix <- seq.DNAbin.matrix[,!gap]
}
as.DNAbin( seq.DNAbin.matrix )
}
##--------------------------------------------------------------------------------------------------------
# olli copied from hivclust
##--------------------------------------------------------------------------------------------------------
seq.singleton2bifurcatingtree<- function(ph.s, dummy.label=NA)
{
if(!Nnode(ph.s))
{
stopifnot(!nrow(ph.s$edge))
if(is.na(dummy.label))
dummy.label <- paste('DUMMY',ph.s$tip.label, sep='_')
ph.s$edge <- matrix(c(3,1,3,2), nrow=2, ncol=2, byrow=TRUE)
ph.s$edge.length <- c(ph.s$root.edge,0)
ph.s$tip.label <- c(ph.s$tip.label, dummy.label)
ph.s$root.edge <- 0
ph.s$Nnode <- 1
}
ph.s
}
##--------------------------------------------------------------------------------------------------------
# olli 23.07.2016
##--------------------------------------------------------------------------------------------------------
#' @import data.table ape
#' @export
seq.bootstrap.gd<- function(seq, seqi, tp, repn=10, bsn=1e2, bs.seed=42)
{
#for each pair, estimate: actual distance, mean distance, variance in distance:
# (do this pairwise because otherwise too computationally expensive
# by gene
tpi <- tp[, {
cat('IDX',IDX, round(IDX/nrow(tp),d=3))
#TAXA1 <- 'IDPOP_13649|M|DOB_1906.66|2011.23'; TAXA2<- 'IDPOP_27993|F|DOB_1961.29|1991.587'
#START <- 1; END<- 1473
#system.time({
df.gd <- seqi[, {
spc <- as.character(seq[c(TAXA1,TAXA2), START:END])
# use same seed across all bootstrap runs, ie running for every gene is the same as running for the full genome
# and running for every taxon pair is the same as running for the whole alignment
set.seed(42)
tmp <- as.data.table(expand.grid(REP=seq_len(repn), BS=seq_len(bsn+1)))
tmp <- tmp[, {
# take bootstrap sample (except if bsi==1)
# the bootstrap is relative to the gene region!
spcb <- copy(spc)
if(BS>1)
{
# note: bootstrap includes ? columns, which adds uncertainty when genetic distances are evaluated over the overlap columns
spcb<- spcb[, sample(ncol(spcb), replace=TRUE)]
}
spb <- as.DNAbin(spcb)
# overlap that is not '?'
do <- sum(apply( spcb!='?', 2, prod))
# count genetic distance on overlap region, ie count gaps '-' as well, on anything that is not '?'
dn <- as.numeric( dist.dna(spb, model='N', pairwise.deletion=TRUE) )
# add indels to differences, but not when the other sequence is '?'
tmp <- which( !apply(spcb=='?', 2, any) )
if(length(tmp))
dn <- dn + as.numeric(dist.dna(spb[, tmp], model='indel'))
# DN can be > 0 if DO is 0, because of indels
list(DN=dn, DO=do)
}, by=c('REP','BS')]
tmp
}, by='GENE']
# collect distances for genes
ans <- subset(df.gd, BS>1)[, list( GD_MEAN=mean(DN/DO), GD_SD=sd(DN/DO) ), by=c('GENE','REP')]
tmp <- subset(df.gd, BS==1)[, list( GD=ifelse(DO==0, NA_real_, DN/DO), DO=DO ), by=c('GENE','REP')]
ans <- merge(tmp, ans, by=c('GENE','REP'))
# calculate distances for full genome
tmp <- subset(df.gd, GENE%in%c('gag','pol','env'))[, list(GENE='gag+pol+env', GD=sum(DN)/sum(DO), DO=sum(DO)), by=c('REP','BS')]
tmp <- tmp[, list(GD= GD[BS==1], DO=DO[BS==1], GD_MEAN=mean(GD[BS>1]), GD_SD=sd(GD[BS>1])), by=c('GENE','REP')]
ans <- rbind(ans, tmp)
#})
ans
}, by=c('TAXA1','TAXA2')]
tpi
}
##--------------------------------------------------------------------------------------------------------
# olli
##--------------------------------------------------------------------------------------------------------
seq.get.gap.chunks<- function(sq, gap.symbol=c('?','n'))
{
ch <- lapply(seq_len(nrow(sq)), function(i)
{
z <- gregexpr('1+', paste(as.numeric( as.character( sq[i,] )%in%gap.symbol ), collapse='') )[[1]]
data.table(TAXON= rownames(sq)[i], POS=as.integer(z), REP=attr(z,"match.length"))
})
ch <- do.call('rbind',ch)
ch <- subset(ch, POS>0)
ch <- merge(ch, ch[, list(COV=sum(REP)), by='TAXON'], by='TAXON')
ch[, COVP:= COV/ncol(sq)]
ch[, POS_NEXT:= POS+REP]
return( ch )
}
##--------------------------------------------------------------------------------------------------------
# olli copied from hivclust
##--------------------------------------------------------------------------------------------------------
seq.addrootnode<- function(ph, dummy.label)
{
ph$tip.label <- c(ph$tip.label, dummy.label)
tmp <- which( ph$edge[,1]>=Ntip(ph) )
ph$edge[tmp,1] <- ph$edge[tmp,1] + 2
tmp <- which( ph$edge[,2]>=Ntip(ph) )
ph$edge[tmp,2] <- ph$edge[tmp,2] + 2
if(Nnode(ph)>0)
ph$edge <- rbind( matrix(c(Ntip(ph)+1,Ntip(ph)+1,Ntip(ph),Ntip(ph)+2), nrow=2, ncol=2), ph$edge)
if(Nnode(ph)==0)
ph$edge <- rbind( matrix(c(Ntip(ph)+1,Ntip(ph)+1,Ntip(ph),Ntip(ph)-1), nrow=2, ncol=2), ph$edge)
ph$edge.length <- c(0,ph$root.edge, ph$edge.length)
ph$root.edge <- 0
ph$Nnode <- ph$Nnode+1
ph
}
##--------------------------------------------------------------------------------------------------------
# olli copied from hivclust
##--------------------------------------------------------------------------------------------------------
seq.collapse.singles<- function (tree)
{
elen <- tree$edge.length
xmat <- tree$edge
node.lab <- tree$node.label
nnode <- tree$Nnode
ntip <- length(tree$tip.label)
root <- 0
singles <- NA
while (length(singles) > 0)
{
tx <- tabulate(xmat[, 1])
singles <- which(tx == 1)
if (length(singles) > 0)
{
i <- singles[1]
prev.node <- which(xmat[, 2] == i)
next.node <- which(xmat[, 1] == i)
xmat[prev.node, 2] <- xmat[next.node, 2]
xmat <- xmat[xmat[, 1] != i, , drop=0]
xmat[xmat > i] <- xmat[xmat > i] - 1L
if(!length(prev.node))
root <- root + elen[next.node]
if(length(prev.node))
elen[prev.node] <- elen[prev.node] + elen[next.node]
if (!is.null(node.lab))
node.lab <- node.lab[-c(i - ntip)]
nnode <- nnode - 1L
elen <- elen[-next.node]
}
}
tree$edge <- xmat
tree$edge.length <- elen
tree$node.label <- node.lab
tree$Nnode <- nnode
tree$root.edge <- root
tree
}
##--------------------------------------------------------------------------------------------------------
# olli copied from hivclust
##--------------------------------------------------------------------------------------------------------
seq.read.newick<- function (file = "", text)
{
if (file != "")
text <- scan(file, sep = "\n", what = "character")
Nnode <- length(gregexpr("\\(", text)[[1]])
Ntip <- 1+length(gregexpr(",", text)[[1]])
tree <- unlist(strsplit(text, NULL))
tip.label <- vector(mode = "character")
node.label <- vector(mode = 'character')
edge <- matrix(data = 0, Nnode + Ntip - 1, 2)
edge.length <- rep(0, Nnode + Ntip - 1)
ntip <- vector(mode = "numeric")
currnode <- Ntip + 1
nodecount <- currnode
i <- 1
j <- 1
k <- 1
while(tree[i] != ";")
{
if(tree[i] == "(")
{
edge[j, 1] <- currnode
i <- i + 1
if(is.na(match(tree[i], c("(", ")", ",", ":", ";"))))
{
l <- gregexpr(",|:|\\)", substr(text, i, nchar(text)))[[1]][1]
stopifnot(l>0)
tip.label[k] <- substr(text, i, i+l-2)
i <- i+l-1
edge[j, 2] <- k
k <- k + 1
ntip[j] <- 1
if (tree[i] == ":")
{
i <- i + 1
l <- gregexpr(",|\\)", substr(text, i, nchar(text)))[[1]][1]
stopifnot(l>0)
edge.length[j] <- as.numeric(substr(text, i, i+l-2))
i <- i+l-1
}
}
else if(tree[i] == "(")
{
nodecount <- nodecount + 1
currnode <- nodecount
edge[j, 2] <- currnode
}
j <- j + 1
}
else if(tree[i] == ")")
{
i <- i + 1
if(is.na(match(tree[i], c(":", ")"))))
{
l <- gregexpr(":|;|\\)", substr(text, i, nchar(text)))[[1]][1]
stopifnot(l>0)
node.label[currnode-Ntip] <- substr(text, i, i+l-2)
i <- i+l-1
}
if(tree[i] == ":")
{
i <- i + 1
l <- gregexpr(",|\\)", substr(text, i, nchar(text)))[[1]][1]
stopifnot(l>0)
edge.length[match(currnode, edge[, 2])] <- as.numeric(substr(text, i, i+l-2))
i <- i+l-1
}
ntip[match(currnode, edge[, 2])] <- sum(ntip[which(edge[, 1] == currnode)])
currnode <- edge[match(currnode, edge[, 2]), 1]
}
else if(tree[i]==",")
{
edge[j, 1] <- currnode
i <- i + 1
if(is.na(match(tree[i], c("(", ")", ",", ":", ";"))))
{
l <- gregexpr(",|:|\\)", substr(text, i, nchar(text)))[[1]][1]
stopifnot(l>0)
tip.label[k] <- substr(text, i, i+l-2)
i <- i+l-1
edge[j, 2] <- k
k <- k + 1
ntip[j] <- 1
if (tree[i] == ":")
{
i <- i + 1
l <- gregexpr(",|\\)", substr(text, i, nchar(text)))[[1]][1]
stopifnot(l>0)
edge.length[j] <- as.numeric(substr(text, i, i+l-2))
i <- i+l-1
}
}
else if (tree[i] == "(")
{
nodecount <- nodecount + 1
currnode <- nodecount
edge[j, 2] <- currnode
}
j <- j + 1
}
}
tmp <- which(edge[,1]==0)
if(length(tmp))
{
edge <- edge[-tmp,]
edge.length <- edge.length[-tmp]
tmp <- sort( unique( as.numeric( edge ) ) )
tmp <- rbind(tmp, seq_along(tmp))
tmp <- sapply( as.numeric( edge ), function(j) tmp[2, match(j, tmp[1,])] )
edge <- matrix(tmp, ncol=2)
}
phy <- list(edge = edge, Nnode = as.integer(Nnode), tip.label = tip.label, Ndesc = ntip)
if(sum(edge.length) > 1e-08)
phy$edge.length <- edge.length
if(length(node.label))
phy$node.label <- node.label
class(phy) <- "phylo"
return(phy)
}
##--------------------------------------------------------------------------------------------------------
# olli copied from hivclust
##--------------------------------------------------------------------------------------------------------
hivc.beast2out.read.nodestats <- function(bstr)
{
# remove anything before first '('
bstr <- regmatches(bstr, regexpr('\\(.*',bstr))
# store meta info for inner nodes that is given in [], and not in :[] which is meta info for edges
tmp <- unlist(regmatches(bstr,gregexpr('[^:]\\[[^]]+',bstr)))
stopifnot(length(tmp)>0)
tmp <- sapply( tmp, function(x) substr(x, 4, nchar(x)) )
# for each inner node, extract stats
tmp <- strsplit(tmp, ',')
tmp <- lapply(seq_along(tmp), function(i)
{
z<- strsplit(tmp[[i]],'=')
data.table(NODE_PARSE_ID=i, STAT=sapply(z,'[',1), VALUE=sapply(z,'[',2))
})
node.stat <- do.call('rbind', tmp)
tmp <- node.stat[, unique(STAT)]
cat(paste('\nFound node statistics=',paste(tmp,collapse=' ')))
tmp <- node.stat[, list(has.all.stats= !length(setdiff(tmp, STAT)) ) , by='NODE_PARSE_ID']
tmp <- subset(tmp, !has.all.stats)[, NODE_PARSE_ID]
cat(paste('\nSome statistics missing for nodes=',paste(tmp,collapse=' ')))
node.stat
}
##--------------------------------------------------------------------------------------------------------
# olli copied from hivclust
##--------------------------------------------------------------------------------------------------------
hivc.beast2out.read.nodeidtree <- function(bstr, method.node.stat='any.node')
{
# strip all meta variables and ; at end
bstr <- gsub("\\[[^]]*\\]", "", bstr)
bstr <- gsub(';','',bstr)
# for each node, add a dummy node label NODE_PARSE_IDxx
dummy.tree <- unlist(strsplit(bstr, ":"))
if(method.node.stat=='inner.node')
{
# interior branch length: previous index ends in ). so tmp is the index of the dummy.tree chunks that gives the start of a branch length of an inner node
tmp <- which( c(FALSE, grepl(')$',dummy.tree)[-length(dummy.tree)]) )
# prepend NODE_PARSE_IDxx before the branch length of an inner node
tmp <- tmp-1
}
if(method.node.stat=='any.node')
tmp <- seq_along(dummy.tree)
dummy.tree <- sapply(seq_along(dummy.tree), function(i)
{
z<- which(i==tmp)
ifelse(length(z), paste(dummy.tree[i],'NODE_PARSE_ID',z,sep=''), dummy.tree[i] )
})
dummy.tree <- paste(dummy.tree, collapse=':',sep='')
dummy.tree <- regmatches(dummy.tree, regexpr('\\(.*',dummy.tree))
dummy.tree <- paste(dummy.tree, ';', sep='')
ph <- seq.read.newick(text=dummy.tree)
ph
}
##--------------------------------------------------------------------------------------------------------
# olli copied from hivclust
##--------------------------------------------------------------------------------------------------------
hivc.beast2out.read.nexus.and.stats<- function(file, tree.id=NA, method.node.stat='any.node')
{
stopifnot(method.node.stat%in%c('any.node','inner.node'))
X <- scan(file = file, what = "", sep = "\n", quiet = TRUE)
# read TRANSLATE chunk
X.endblock <- grep("END;|ENDBLOCK;|End;", X, ignore.case = TRUE)
X.semico <- grep(";", X)
X.i1 <- grep("BEGIN TREES;|Begin trees;", X, ignore.case = TRUE)
X.i2 <- grep("TRANSLATE|Translate", X, ignore.case = TRUE)
tmp <- X.semico[X.semico > X.i2][1]
tmp <- X[(X.i2 + 1):tmp]
tmp <- gsub('[,;]$','',gsub('^\\s+','',tmp))
tmp <- tmp[nzchar(tmp)]
tmp <- strsplit(tmp, ' ')
df.translate <- data.table(NEXUS_ID= sapply(tmp, '[[', 1), NEXUS_LABEL=sapply(tmp, '[[', 2) )
set(df.translate, NULL, 'NEXUS_LABEL', df.translate[, gsub("\'","",NEXUS_LABEL)])
cat(paste('\nFound taxa, n=', nrow(df.translate)))
if(!is.na(tree.id))
{
# read one newick tree with id 'tree.id'
bstr <- X[grep(paste(tree.id,"[[:space:]]+",sep=''), X)]
node.stat <- hivc.beast2out.read.nodestats(bstr)
cat(paste('\nFound node statistics, n=', nrow(node.stat)))
set(node.stat, NULL, 'tree.id', tree.id[i] )
btree <- hivc.beast2out.read.nodeidtree(bstr, method.node.stat=method.node.stat)
#
# link node.stats with tree nodes (tip + inner node)
# NODE_ID is index of node in 'btree' phylo object
#
tmp <- strsplit( btree$tip.label, 'NODE_PARSE_ID' )
df.link <- data.table(NODE_ID=seq_along(btree$tip.label), NEXUS_ID=sapply(tmp,'[[',1), NODE_PARSE_ID=sapply(tmp,'[[',2))
df.link <- merge(df.link, df.translate, by='NEXUS_ID')
cat(paste('\nFound tree tips with taxon name, n=', nrow(df.link)))
tmp <- strsplit( btree$node.label, 'NODE_PARSE_ID' )
tmp <- data.table(NODE_ID=Ntip(btree)+seq_along(btree$node.label), NODE_PARSE_ID=sapply(tmp,'[[',2), NEXUS_LABEL=NA_character_)
df.link <- rbind(subset(df.link,select=c(NODE_ID, NODE_PARSE_ID, NEXUS_LABEL)), tmp)
set(df.link,NULL,'NODE_PARSE_ID',df.link[, as.integer(NODE_PARSE_ID)])
set(df.link,NULL,'NODE_ID',df.link[, as.integer(NODE_ID)])
set(df.link,NULL,'TREE_ID',tree.id)
node.stat <- merge( node.stat, subset(df.link, select=c(NODE_PARSE_ID, NODE_ID, TREE_ID)), by='NODE_PARSE_ID' )
set(node.stat,NULL,'NODE_PARSE_ID',NULL)
cat(paste('\nLinked node statistics to tree nodes, n=', nrow(node.stat)))
#
# set tip.labels and rm node.labels
#
setkey(df.link, NODE_ID)
btree$tip.label <- df.link[seq_len(Ntip(btree)),][,NEXUS_LABEL]
btree$node.label <- NULL
}
if(is.na(tree.id))
{
# read all newick trees in nexus file
tmp <- regexpr('^tree\\s\\S+',X)
tree.id <- sapply( regmatches(X,tmp), function(x) substr(x, 5, nchar(x)))
tree.id <- gsub('\\s','',tree.id)
cat(paste('\nFound tree id=', paste(tree.id, collapse=' ')))
X <- X[ which(tmp>0) ]
cat(paste('\nFound trees, n=',length(tree.id)))
node.stat <- lapply(seq_along(tree.id), function(i)
{
bstr <- X[grep(paste(tree.id[i],"[[:space:]]+",sep=''), X)]
cat(paste('\nGet node statistics for tree id=',tree.id[i]))
tmp <- hivc.beast2out.read.nodestats(bstr)
set(tmp, NULL, 'TREE_ID', tree.id[i] )
tmp
})
if(length(node.stat)>1)
node.stat <- do.call('rbind',node.stat)
if(length(node.stat)==1)
node.stat <- node.stat[[1]]
suppressWarnings({ node.stat[, NODE_ID:=NA_integer_] })
setkey(node.stat, TREE_ID, NODE_PARSE_ID)
btree <- vector('list',length(tree.id))
for(i in seq_along(tree.id))
{
bstr <- X[grep(paste(tree.id[i],"[[:space:]]+",sep=''), X)]
cat(paste('\nRead tree for tree id=',tree.id[i]))
btree.i <- hivc.beast2out.read.nodeidtree(bstr, method.node.stat=method.node.stat)
#
# link node.stats with tree nodes (tip + inner node)
# NODE_ID is index of node in 'btree.i' phylo object
#
tmp <- strsplit( btree.i$tip.label, 'NODE_PARSE_ID' )
df.link <- data.table(NODE_ID=seq_along(btree.i$tip.label), NEXUS_ID=sapply(tmp,'[[',1), NODE_PARSE_ID=sapply(tmp,'[[',2))
df.link <- merge(df.link, df.translate, by='NEXUS_ID')
cat(paste('\nFound tree tips with taxon name, n=', nrow(df.link)))
tmp <- strsplit( btree.i$node.label, 'NODE_PARSE_ID' )
tmp <- data.table(NODE_ID=Ntip(btree.i)+seq_along(btree.i$node.label), NODE_PARSE_ID=sapply(tmp,'[[',2), NEXUS_LABEL=NA_character_)
df.link <- rbind(subset(df.link,select=c(NODE_ID, NODE_PARSE_ID, NEXUS_LABEL)), tmp)
set(df.link,NULL,'NODE_PARSE_ID',df.link[, as.integer(NODE_PARSE_ID)])
set(df.link,NULL,'NODE_ID',df.link[, as.integer(NODE_ID)])
for(j in seq_len(nrow(df.link)))
set(node.stat, node.stat[, which(TREE_ID==tree.id[i] & NODE_PARSE_ID==df.link[j,NODE_PARSE_ID])], 'NODE_ID', df.link[j,NODE_ID])
tmp <- node.stat[, length(which(!is.na(NODE_ID)))]
cat(paste('\nTotal linked node statistics to tree nodes, n=', tmp ))
#
# set tip.labels and rm node.labels
#
setkey(df.link, NODE_ID)
btree.i$tip.label <- df.link[seq_len(Ntip(btree.i)),][,NEXUS_LABEL]
btree.i$node.label <- NULL
btree[[i]] <- btree.i
}
if(length(btree)>=2)
{
names(btree) <- tree.id
class(btree) <- "multiPhylo"
}
if(length(btree)<2)
btree <- btree[[1]]
tmp <- node.stat[, length(which(is.na(NODE_ID)))]
cat(paste('\nTotal unlinked node statistics [should be zero], n=', tmp ))
set(node.stat,NULL,'NODE_PARSE_ID',NULL)
}
list(tree=btree, node.stat=node.stat)
}
##--------------------------------------------------------------------------------------------------------
# return distribution of GTR parameters
# olli originally written 20-09-2014
##--------------------------------------------------------------------------------------------------------
#' @title Create data.table of GTR parameters
#' @description Returns a data.table of GTR parameters.
#' @return data.table
PANGEA.GTR.params<- function()
{
file <- system.file(package="PANGEA.HIV.sim", "misc",'PANGEA_SSAfgBwhRc-_140907_n390_BEASTlog.R')
cat(paste('\nreading GTR parameters from file',file))
load(file) # expect log.df
# exclude odd BEAST runs
log.df <- subset(log.df, !(GENE=='ENV' & FILE=='pool3'))
# exclude cols
log.df[, ucld.mean:=NULL]
log.df[, ucld.stdev:=NULL]
log.df[, coefficientOfVariation:=NULL]
log.df[, treeModel.rootHeight:=NULL]
# check that relative rates have mean 1
stopifnot( log.df[, list(CHECK=sum(mu)), by=c('FILE','GENE','state')][, !any(abs(CHECK-3)>2*1e-12)] )
# get mean rate. need to be a bit careful here: rate is per site, so length of gene does not matter
# but we may have different numbers of samples for each gene
tmp <- log.df[, list(meanRate=mean(meanRate)), by='GENE'][, mean(meanRate)]
# ignore variation in meanRate by gene, keep variation across codon_pos for each state
log.df <- merge(log.df, log.df[, list(mu.gene= mean(meanRate)/tmp), by='GENE'], by='GENE')
set(log.df, NULL, 'mu', log.df[, mu*mu.gene])
set(log.df, NULL, 'meanRate', tmp)
set(log.df, NULL, 'mu.gene', NULL)
# plot
if(0)
{
pr <- copy(log.df)
set(pr, NULL, c('alpha','a','c','g','t','meanRate'), NULL)
setnames(pr, 'mu', 'relative\nevolutionary rate')
setnames(pr, 'at', 'relative\nsubstitution rate\na->t')
setnames(pr, 'ac', 'relative\nsubstitution rate\na->c')
setnames(pr, 'ag', 'relative\nsubstitution rate\na->g')
setnames(pr, 'cg', 'relative\nsubstitution rate\nc->g')
setnames(pr, 'gt', 'relative\nsubstitution rate\ng->t')
set(pr, NULL, 'GENE', pr[, factor(GENE, levels=c('GAG','POL','ENV'), labels=c('gag','pol','env'))])
set(pr, NULL, 'CODON_POS', pr[, factor(CODON_POS, levels=c('CP1','CP2','CP3'), labels=c('codon\nposition 1','codon\nposition 2','codon\nposition 3'))])
pr <- melt(pr, measure.vars=c('relative\nevolutionary rate','relative\nsubstitution rate\na->t','relative\nsubstitution rate\na->c','relative\nsubstitution rate\nc->g','relative\nsubstitution rate\na->g','relative\nsubstitution rate\ng->t'))
ggplot(pr, aes(x=value)) +
geom_histogram(aes(y = ..density..), bins=50) +
facet_grid(GENE+CODON_POS~variable, scales='free') +
theme_bw() + labs(x='', y='sampling distribution')
ggsave(file='/Users/Oliver/Dropbox (SPH Imperial College)/2015_PANGEA_MCE_manuscript/figures/Supp_Figure_Regional_GTR.pdf', w=10, h=10)
}
log.df
}
##--------------------------------------------------------------------------------------------------------
# simulate imports during the epidemic
# olli originally written 27-01-2015
##--------------------------------------------------------------------------------------------------------
PANGEA.ImportSimulator.SimulateStartingTimeOfIndexCase.v2<- function(df.ind, df.trm, index.starttime.mode='normal')
{
stopifnot(grepl('normal|fix|shift', index.starttime.mode))
# add transmission time for index case -- this is 40 years back in time so we can sample a starting sequence
# and then generate a long branch to the transmission chain in the population. No hack. :-)
tmp <- subset( df.trm, IDTR<0, select=IDTR )
if( index.starttime.mode == 'normal' )
{
cat(paste('\nUsing index.starttime.mode rnorm(n, 1955, 7)'))
tmp2 <- rnorm(2*nrow(tmp), 1955, 7)
tmp2 <- tmp2[ tmp2>1946 & tmp2<1980]
stopifnot( nrow(tmp)<=length(tmp2) )
length(tmp2)<- nrow(tmp)
set(tmp, NULL, 'IDTR_TIME_INFECTED.new', tmp2 )
}
if( grepl('fix',index.starttime.mode) )
{
tmp2 <- as.numeric(substring(index.starttime.mode, 4))
stopifnot(tmp2<=1980)
cat(paste('\nUsing index.starttime.mode rep=', tmp2))
set(tmp, NULL, 'IDTR_TIME_INFECTED.new', rep(tmp2, nrow(tmp)) )
}
if( grepl('shift',index.starttime.mode))
{
tmp2 <- as.numeric(substring(index.starttime.mode, 6))
cat(paste('\nUsing index.starttime.mode rep=', tmp2))
set(tmp, NULL, 'IDTR_TIME_INFECTED.new', rep(tmp2, nrow(tmp)) )
}
if( index.starttime.mode == 'fix45')
{
cat(paste('\nUsing index.starttime.mode runif( n, 1945, 1945.5 )'))
tmp2 <- runif( nrow(tmp), 1945, 1945.5 )
set(tmp, NULL, 'IDTR_TIME_INFECTED.new', tmp2 )
}
#
df.trm <- merge( df.trm, tmp, by='IDTR', all.x=TRUE )
tmp2 <- df.trm[, which(!is.na(IDTR_TIME_INFECTED.new) & IDTR_TIME_INFECTED<IDTR_TIME_INFECTED.new)]
set(df.trm, tmp2, 'IDTR_TIME_INFECTED.new', df.trm[tmp2, IDTR_TIME_INFECTED])
#
tmp2 <- df.trm[, which(!is.na(IDTR_TIME_INFECTED.new))]
set(df.trm, tmp2, 'IDTR_TIME_INFECTED', df.trm[tmp2, IDTR_TIME_INFECTED.new])
df.trm[, IDTR_TIME_INFECTED.new:=NULL]
#
stopifnot( nrow(subset(df.trm, TIME_TR<IDTR_TIME_INFECTED))==0 )
stopifnot( nrow(subset(df.trm, is.na(TIME_TR)))==0 )
stopifnot( nrow(subset(df.trm, is.na(IDTR_TIME_INFECTED)))==0 )
#
tmp <- subset( df.trm, IDTR<0, select=c(IDTR, IDTR_TIME_INFECTED) )
setnames(tmp, c('IDTR','IDTR_TIME_INFECTED'), c('IDPOP','IDTR_TIME_INFECTED.new'))
df.ind <- merge(df.ind, tmp, by='IDPOP', all.x=TRUE)
tmp2 <- df.ind[, which(!is.na(IDTR_TIME_INFECTED.new))]
set(df.ind, tmp2, 'TIME_TR', df.ind[tmp2, IDTR_TIME_INFECTED.new])
df.ind[, IDTR_TIME_INFECTED.new:=NULL]
list(df.ind=df.ind, df.trm=df.trm)
}
##--------------------------------------------------------------------------------------------------------
# simulates seroprevalence survey of a proportion of a population
# olli originally written 29-01-2015
##--------------------------------------------------------------------------------------------------------
PANGEA.SeroPrevalenceSurvey<- function(df.inds, epi.adult=13, s.INTERVENTION.start=2015, sp.prop.of.sexactive=0.05, sp.times=c(12, 8, 4, 0), verbose=1, test=0)
{
df.sp <- lapply( s.INTERVENTION.start-sp.times, function(yr)
{
yr <- yr-1/365
sxon.all <- which( (df.inds[['DOB']]+epi.adult)<=yr & df.inds[['DOD']]>yr )
sxon.sp <- sort(sample(sxon.all, round(length(sxon.all)*sp.prop.of.sexactive) ))
if(verbose)
{
cat(paste('\nSero prevalence survey in year', yr))
cat(paste('\nTotal number of individuals=', length(sxon.all)))
cat(paste('\nProp to include in survey=', sp.prop.of.sexactive))
cat(paste('\nTotal number of individuals in survey=', length(sxon.sp)))
}
df.sp <- df.inds[sxon.sp, ]
df.sp[, YR:=yr]
df.sp[, AGE:= df.sp[, cut(YR-DOB, breaks=c(epi.adult-1, 16, 20, 25, 30, 35, 40, 50, 60, Inf))]]
df.sp
})
df.sp <- do.call('rbind', df.sp)
if(test)
df.sp <- df.sp[, list( ALIVE_N=length(IDPOP), ALIVE_AND_HIV_N=length(which(TIME_TR<=YR)), ALIVE_AND_DIAG_N=length(which(DIAG_T<=YR)), ALIVE_AND_ART_N=length(which(ART1_T<=YR)), ALIVE_AND_SEQ_N=length(which(TIME_SEQ<=YR)) ), by=c('YR', 'GENDER', 'AGE')]
if(!test)
df.sp <- df.sp[, list( ALIVE_N=length(IDPOP), ALIVE_AND_DIAG_N=length(which(DIAG_T<=YR)), ALIVE_AND_ART_N=length(which(ART1_T<=YR)), ALIVE_AND_SEQ_N=length(which(TIME_SEQ<=YR)) ), by=c('YR', 'GENDER', 'AGE')]
setkey(df.sp, YR, GENDER, AGE)
df.sp
}
##--------------------------------------------------------------------------------------------------------
# simulate imports during the epidemic
# olli originally written 11-09-2014
##--------------------------------------------------------------------------------------------------------
PANGEA.ImportSimulator.SimulateIndexCase<- function(df.ind, df.trm, epi.import)
{
# model imports by re-assigning a fraction of infecteds as 'index cases'
# assume there are no imports at the moment, and that we know the time of infection of the imports
tmp <- df.trm[, which(IDTR>0)]
cat(paste('\nFound transmissions within population, n=', length(tmp)))
tmp2 <- round(nrow(df.trm)*epi.import) - ( nrow(df.trm)-length(tmp)-1 )
tmp2 <- max(tmp2, 0)
cat(paste('\nRe-setting infecteds as index cases after imports, n=', tmp2))
stopifnot(length(tmp)>1)
stopifnot(length(tmp2)>=0)
tmp2 <- as.integer(sample( tmp, tmp2, replace=FALSE ))
# update df.trm
setkey(df.trm, TIME_TR)
set(df.trm, tmp2, 'IDTR', df.trm[, min(IDTR)]-rev(seq_along(tmp2)) )
if('TR_ACUTE'%in%colnames(df.trm))
set(df.trm, df.trm[, which(IDTR<0)], 'TR_ACUTE', NA_character_)
# update df.ind
tmp <- subset(df.trm, select=c(IDTR, IDTR_TIME_INFECTED))
setnames(tmp, c('IDTR', 'IDTR_TIME_INFECTED'), c('IDPOP', 'TIME_TR'))
tmp2 <- subset(df.trm, select=c(IDREC, TIME_TR))
setnames(tmp2, 'IDREC', 'IDPOP')
tmp <- rbind( tmp, tmp2 )
setkey(tmp, IDPOP)
df.ind <- merge( df.ind, unique(tmp, by='IDPOP'), by=c('IDPOP','TIME_TR'), all.x=TRUE, all.y=TRUE )
#
cat(paste('\nProportion of imported transmissions, p=', (nrow(subset(df.trm, IDTR<0))-1)/nrow(df.trm) ))
stopifnot( length(setdiff(df.trm[, IDTR], df.ind[, IDPOP]))==0 )
stopifnot( length(setdiff(df.trm[, IDREC], df.ind[, IDPOP]))==0 )
list(df.ind=df.ind, df.trm=df.trm)
}
##--------------------------------------------------------------------------------------------------------
# simulate guide to sequence sampling times. if not NA, then in every year, individuals in +-6mo to the guide are sampled
# olli originally written 26-01-2015
##--------------------------------------------------------------------------------------------------------
PANGEA.Seqsampler.SimulateGuideToSamplingTimes.v2<- function(df.ind, seqtime.mode)
{
stopifnot(grepl('DUnif|Exp|AtDiag|AtART|AtTrm|AtYear|Weibull',seqtime.mode))
cat(paste('\nUsing seqtime.mode=', seqtime.mode ))
if(grepl('Exp',seqtime.mode))
{
tmp <- as.numeric(substring(seqtime.mode,4))
stopifnot(is.finite(tmp))
cat(paste('\nPANGEA.Seqsampler.SimulateGuideToSamplingTimes.v2: avg time to sequencing since diagnosis=', tmp))
df.ind[, T1_SEQ:= rexp(nrow(df.ind), rate=1/tmp) + df.ind[,DIAG_T]]
# need T1_SEQ even when no diagnosis for archival samples
tmp <- df.ind[, which(is.na(T1_SEQ) & DOD-TIME_TR>0.5)]
set( df.ind, tmp, 'T1_SEQ', df.ind[tmp, runif(length(tmp), TIME_TR+0.5, DOD)] )
tmp <- df.ind[, which( is.na(DIAG_T) & T1_SEQ>min(DIAG_T, na.rm=1) )]
set( df.ind, tmp, 'T1_SEQ', NA_real_)
tmp <- df.ind[, which(T1_SEQ>ART1_T)]
set(df.ind, tmp, 'T1_SEQ', df.ind[tmp,ART1_T])
}
if(grepl('AtYear',seqtime.mode))
{
tmp <- as.numeric(substring(seqtime.mode,7))
stopifnot(is.finite(tmp))
cat(paste('\nPANGEA.Seqsampler.SimulateGuideToSamplingTimes.v2: sequencing years after infection=', tmp))
df.ind[, T1_SEQ:= tmp + df.ind[,TIME_TR]]
tmp <- df.ind[, which(!is.na(T1_SEQ) & DOD<T1_SEQ)]
set(df.ind, tmp, 'T1_SEQ', df.ind[tmp, (DOD-TIME_TR)*.9 + TIME_TR])
}
if(grepl('DUnif',seqtime.mode))
{
tmp <- as.numeric(substring(seqtime.mode,6))
stopifnot(is.finite(tmp))
cat(paste('\nPANGEA.Seqsampler.SimulateGuideToSamplingTimes.v2: max time to sequencing since diagnosis=', tmp))
df.ind[, T1_SEQ:= runif(nrow(df.ind), min=0, max=tmp) + df.ind[,DIAG_T]]
# need T1_SEQ even when no diagnosis for archival samples
tmp <- df.ind[, which(is.na(T1_SEQ) & DOD-TIME_TR>0.5)]
set( df.ind, tmp, 'T1_SEQ', df.ind[tmp, runif(length(tmp), TIME_TR+0.5, DOD)] )
tmp <- df.ind[, which( is.na(DIAG_T) & T1_SEQ>min(DIAG_T, na.rm=1) )]
set( df.ind, tmp, 'T1_SEQ', NA_real_)
tmp <- df.ind[, which(T1_SEQ>ART1_T)]
set(df.ind, tmp, 'T1_SEQ', df.ind[tmp,ART1_T])
}
if(seqtime.mode=='AtTrm')
{
df.ind[, T1_SEQ:= df.ind[, TIME_TR+0.1]]
tmp <- df.ind[, which(is.na(T1_SEQ) & DOD<T1_SEQ)]
set( df.ind, tmp, 'T1_SEQ', df.ind[tmp, TIME_TR+(TIME_TR+DOD)/2] )
tmp <- df.ind[, which(T1_SEQ>ART1_T)]
set(df.ind, tmp, 'T1_SEQ', df.ind[tmp,ART1_T])
}
if(seqtime.mode=='AtDiag')
{
df.ind[, T1_SEQ:= df.ind[, DIAG_T]]
tmp <- df.ind[, which(is.na(T1_SEQ) & DOD-TIME_TR>0.5)]
set( df.ind, tmp, 'T1_SEQ', df.ind[tmp, runif(length(tmp), TIME_TR+0.5, DOD)] ) # define a random sequencing time
tmp <- df.ind[, which( is.na(DIAG_T) & T1_SEQ>min(DIAG_T, na.rm=1) )] # no sequences before diagnosing started in 2000
set( df.ind, tmp, 'T1_SEQ', NA_real_)
tmp <- df.ind[, which(T1_SEQ>ART1_T)] # no sequences if the proposed date is after the first ART date
set(df.ind, tmp, 'T1_SEQ', df.ind[tmp,ART1_T])
}
if(grepl('AtDiagOrWeibull',seqtime.mode))
{
shape <- as.numeric( strsplit(seqtime.mode,'_')[[1]][2] )
scale <- as.numeric( strsplit(seqtime.mode,'_')[[1]][3] )
df.ind[, T1_SEQ:= df.ind[, DIAG_T]]
tmp <- df.ind[, which(is.na(T1_SEQ) & DOD-TIME_TR<3)]
set( df.ind, tmp, 'T1_SEQ', df.ind[tmp, runif(length(tmp), TIME_TR, DOD)] ) # define a random sequencing time
repeat{
tmp <- df.ind[, which(!is.na(TIME_TR) & is.na(T1_SEQ))]
cat('\tmissing T1_SEQ=', length(tmp))
set(df.ind, tmp, 'T1_SEQ', df.ind[tmp,TIME_TR]+rweibull(length(tmp), shape, scale))
tmp <- df.ind[, which(T1_SEQ>DOD | T1_SEQ>ART1_T )]
set(df.ind, tmp, 'T1_SEQ', NA_real_)
if(!length(tmp)) break
}
}
if(grepl('Weibull',seqtime.mode))
{
shape <- as.numeric( strsplit(seqtime.mode,'_')[[1]][2] )
scale <- as.numeric( strsplit(seqtime.mode,'_')[[1]][3] )
df.ind[, T1_SEQ:= NA_real_]
tmp <- df.ind[, which(is.na(T1_SEQ) & DOD-TIME_TR<3)]
set( df.ind, tmp, 'T1_SEQ', df.ind[tmp, runif(length(tmp), TIME_TR, DOD)] ) # define a random sequencing time
repeat{
tmp <- df.ind[, which(!is.na(TIME_TR) & is.na(T1_SEQ))]
cat('\tmissing T1_SEQ=', length(tmp))
set(df.ind, tmp, 'T1_SEQ', df.ind[tmp,TIME_TR]+rweibull(length(tmp), shape, scale))
tmp <- df.ind[, which(T1_SEQ>DOD | T1_SEQ>ART1_T )]
set(df.ind, tmp, 'T1_SEQ', NA_real_)
if(!length(tmp)) break
}
}
if(seqtime.mode=='AtART')
{
df.ind[, T1_SEQ:= df.ind[, ART1_T]]
tmp <- df.ind[, which(is.na(T1_SEQ) & DOD-TIME_TR>0.5)]
set( df.ind, tmp, 'T1_SEQ', df.ind[tmp, runif(length(tmp), TIME_TR+0.5, DOD)] )
tmp <- df.ind[, which( is.na(DIAG_T) & T1_SEQ>min(DIAG_T, na.rm=1) )]
set( df.ind, tmp, 'T1_SEQ', NA_real_)
tmp <- df.ind[, which(T1_SEQ>ART1_T)]
set(df.ind, tmp, 'T1_SEQ', df.ind[tmp,ART1_T])
}
df.ind
}
##--------------------------------------------------------------------------------------------------------
# sample proportional to untreated
#
# olli originally written 27-01-2015
##--------------------------------------------------------------------------------------------------------
PANGEA.Seqsampler.sample.prop.to.untreated<- function(df.ind, df.epi, pipeline.args, sp, verbose=1)
{
s.total <- round( df.epi[nrow(df.epi), PREV_EVER] * sp )
if(verbose)
{
cat(paste('\nSample proportional to untreated population'))
cat(paste('\nSampling sequences, target is n=', s.total))
}
# setup number of sequences to be sampled for each year
df.sample <- subset( df.epi, YR>= pipeline.args['yr.start',][, as.numeric(v)] & YR<pipeline.args['yr.end',][, as.numeric(v)] )
set(df.sample, NULL, 's.nTOTAL', as.numeric(rmultinom(1, s.total, df.sample[, PREV-TREATED]/ df.epi[, sum(PREV-TREATED)])) )
if(verbose)
cat(paste('\nSampling sequences, scheduled number is n=', df.sample[, sum(s.nTOTAL)]))
stopifnot(df.sample[, all(s.nTOTAL>=0)])
if(verbose)
cat(paste('\n prop of sequences sampled among HIV+=', df.sample[, sum( s.nTOTAL )] / df.sample[, rev(PREV_EVER)[1]]))
#
# SAMPLE INFECTED INDIVIDUALS BASED ON NUMBERS PER YEAR
#
# sample non-incident cases by year
df.inds <- copy(df.ind)
df.inds[, TIME_SEQ:= NA_real_]
for(yr in df.sample[, YR]) #TODO took out [-1] because there are s.n.notINC for DSPS in 1980
{
# of all infected and not incident and not yet sampled, sample
if(verbose)
cat(paste('\nadd samples in year',yr,', required=', subset( df.sample, YR==yr )[, s.nTOTAL]))
tmp <- df.inds[, which(is.na(TIME_SEQ) & floor(DOB)<=yr & ceiling(DOD)>yr+1 & yr-floor(TIME_TR)>=0)]
if(verbose)
cat(paste('\navailable non-sampled HIV+ individuals in year=', length(tmp)))
tmp <- df.inds[, which(is.na(TIME_SEQ) & floor(DOB)<=yr & ceiling(DOD)>yr+1 & yr-floor(T1_SEQ)==0)]
if(verbose)
cat(paste('\navailable non-sampled HIV+ individuals with suggested sampling date in year=', length(tmp)))
stopifnot(length(tmp)>0)
tmp <- sample(tmp, subset( df.sample, YR==yr )[, s.nTOTAL])
set( df.inds, tmp, 'TIME_SEQ', df.inds[tmp, T1_SEQ] )
}
stopifnot( df.inds[, !any(TIME_SEQ>DOD, na.rm=TRUE)] )
stopifnot( df.inds[, !any(TIME_SEQ<TIME_TR, na.rm=TRUE)] )
if(verbose)
{
cat(paste('\n total number of HIV+ in df.inds=', nrow(subset(df.inds, !is.na(TIME_TR)))))
cat(paste('\n total number of sampled HIV+ in df.inds=', nrow(subset(df.inds, !is.na(TIME_TR) & !is.na(TIME_SEQ)))))
}
list(df.inds=df.inds, df.sample=df.sample)
}
##--------------------------------------------------------------------------------------------------------
# sample proportional to infected
# olli originally written 03-04-2015
##--------------------------------------------------------------------------------------------------------
PANGEA.Seqsampler.sample.prop.to.T1SEQ<- function(df.ind, df.epi, pipeline.args, verbose=0)
{
sp <- pipeline.args['s.PREV.max',][, as.numeric(v)]
if(verbose)
{
cat(paste('\nSample proportional to T1SEQ'))
cat(paste('\nSampling fraction per year is n=', sp))
}
# setup number of sequences to be sampled for each year
df.sample <- subset( df.epi, YR>= pipeline.args['yr.start',][, as.numeric(v)] & YR<pipeline.args['yr.end',][, as.numeric(v)] )
set(df.sample, NULL, 's.nTOTAL', 0)
set(df.sample, NULL, 's.nTOTAL', round(df.sample[, T1_SEQ]*sp, d=0) )
#set(df.sample, NULL, 's.nTOTAL', rbinom(nrow(df.sample), df.sample[, T1_SEQ], sp) )
if(verbose)
cat(paste('\nSampling sequences, scheduled number is n=', df.sample[, sum(s.nTOTAL)]))
stopifnot(df.sample[, all(s.nTOTAL>=0)])
#
# SAMPLE INFECTED INDIVIDUALS BASED ON NUMBERS PER YEAR
#
# sample non-incident cases by year
df.inds <- copy(df.ind)
df.inds[, TIME_SEQ:= NA_real_]
for(yr in df.sample[, YR]) #TODO took out [-1] because there are s.n.notINC for DSPS in 1980
{
# of all infected and not incident and not yet sampled, sample
if(verbose)
cat(paste('\nadd samples in year',yr,', required=', subset( df.sample, YR==yr )[, s.nTOTAL]))
tmp <- df.inds[, which(is.na(TIME_SEQ) & floor(T1_SEQ)==yr & TIME_TR>=pipeline.args['yr.start',][, as.numeric(v)]) ]
if(verbose)
cat(paste('\navailable non-sampled HIV+ individuals in year=', length(tmp)))
stopifnot(length(tmp)>0)
tmp <- sample(tmp, subset( df.sample, YR==yr )[, s.nTOTAL])
if(verbose)
cat(paste('\nfound samples in year',yr,', required=', length(tmp)))
set( df.inds, tmp, 'TIME_SEQ', df.inds[tmp, T1_SEQ] )
}
stopifnot( df.inds[, !any(TIME_SEQ>DOD, na.rm=TRUE)] )
stopifnot( df.inds[, !any(TIME_SEQ<TIME_TR, na.rm=TRUE)] )
if(verbose)
{
cat(paste('\n total number of HIV+ in [yr.start,yr.end] in df.inds=', nrow(subset(df.inds, IDPOP>=0 & !is.na(TIME_TR) & floor(TIME_TR)>=pipeline.args['yr.start',][, as.numeric(v)] & TIME_TR<pipeline.args['yr.end',][, as.numeric(v)] ))))
cat(paste('\n total number of sampled HIV+ in [yr.start,yr.end] in df.inds=', nrow(subset(df.inds, IDPOP>=0 & !is.na(TIME_TR) & floor(TIME_TR)>=pipeline.args['yr.start',][, as.numeric(v)] & TIME_TR<pipeline.args['yr.end',][, as.numeric(v)] & !is.na(TIME_SEQ)))))
cat(paste('\n total number of non-sampled HIV+ in [yr.start,yr.end] in df.inds=', nrow(subset(df.inds, !is.na(TIME_TR) & is.na(TIME_SEQ) & IDPOP>=0 & TIME_TR<pipeline.args['yr.end',][, as.numeric(v)] & floor(TIME_TR)>=pipeline.args['yr.start',][, as.numeric(v)] ))))
}
list(df.inds=df.inds, df.sample=df.sample)
}
##--------------------------------------------------------------------------------------------------------
# sample proportional to diagnoses before and after interventions
# s% of those newly diagnosed per year until 2015
# 2*s% of those newly diagnosed per year after 2015
# from before then: 50 (uniform)
#
# olli originally written 27-01-2015
##--------------------------------------------------------------------------------------------------------
PANGEA.Seqsampler.sample.prop.to.diagnosis<- function(df.ind, df.epi, pipeline.args, sp)
{
s.total <- round( df.epi[nrow(df.epi), PREV] * sp )
s.archival.yr <- subset(df.epi, DIAG==0)[, tail(YR,1)]
s.diagb4intervention.n <- subset( df.epi, YR<pipeline.args['s.INTERVENTION.start',][, as.numeric(v)] )[, tail(DIAG,1)]
s.diagb4intervention <- (s.total-pipeline.args['s.ARCHIVAL.n',][, as.numeric(v)]) / (s.diagb4intervention.n+pipeline.args['s.INTERVENTION.mul'][, as.numeric(v)]*(df.epi[nrow(df.epi), DIAG]-s.diagb4intervention.n))
cat(paste('\nSample proportional to diagnoses up to intervention, and proportional to diagnoses after intervention start'))
cat(paste('\nSampling sequences, target is n=', s.total))
cat(paste('\nNo diagnoses up to yr, sampling archival sequences till then. yr=', s.archival.yr))
cat(paste('\nSampling archival sequences before any diagnoses, n=', pipeline.args['s.ARCHIVAL.n',][, as.numeric(v)]))
cat(paste('\nSampling sequences before intervention start from diagnosed, p=', s.diagb4intervention))
cat(paste('\nSampling sequences after intervention start from diagnosed, p=', 2*s.diagb4intervention))
# setup number of sequences to be sampled for each year
df.sample <- subset( df.epi, YR>= pipeline.args['yr.start',][, as.numeric(v)] & YR<pipeline.args['yr.end',][, as.numeric(v)] )
set(df.sample, NULL, 's.nTOTAL', 0)
tmp <- df.sample[, which(YR<=s.archival.yr)]
set(df.sample, tmp, 's.nTOTAL', as.numeric(rmultinom(1, pipeline.args['s.ARCHIVAL.n',][, as.numeric(v)], rep(1/length(tmp), length(tmp)))))
tmp <- df.sample[, which( YR>s.archival.yr & YR<pipeline.args['s.INTERVENTION.start',][, as.numeric(v)] )]
set(df.sample, tmp, 's.nTOTAL', as.numeric(rmultinom(1, round( s.diagb4intervention.n*s.diagb4intervention ), df.sample[tmp, NEW_DIAG]/df.sample[tmp, sum(NEW_DIAG)])) )
tmp <- df.sample[, which(YR>=pipeline.args['s.INTERVENTION.start',][, as.numeric(v)]) ]
set(df.sample, tmp, 's.nTOTAL', as.numeric(rmultinom(1, round( (df.sample[nrow(df.sample), DIAG]-s.diagb4intervention.n)*pipeline.args['s.INTERVENTION.mul'][, as.numeric(v)]*s.diagb4intervention ), df.sample[tmp, NEW_DIAG]/df.sample[tmp, sum(NEW_DIAG)])) )
cat(paste('\nSampling sequences, scheduled number is n=', df.sample[, sum(s.nTOTAL)]))
stopifnot(df.sample[, all(s.nTOTAL>=0)])
cat(paste('\n prop of sequences sampled among HIV+=', df.sample[, sum( s.nTOTAL )] / df.sample[, rev(PREV)[1]]))
stopifnot( abs(sp-df.sample[, sum( s.nTOTAL )] / df.sample[, rev(PREV)[1]])<=sp*0.1 )
#
# SAMPLE INFECTED INDIVIDUALS BASED ON NUMBERS PER YEAR
#
# sample non-incident cases by year
df.inds <- copy(df.ind)
df.inds[, TIME_SEQ:= NA_real_]
for(yr in df.sample[, YR]) #TODO took out [-1] because there are s.n.notINC for DSPS in 1980
{
# of all infected and not incident and not yet sampled, sample
cat(paste('\nadd samples in year',yr,', required=', subset( df.sample, YR==yr )[, s.nTOTAL]))
tmp <- df.inds[, which(is.na(TIME_SEQ) & floor(DOB)<=yr & ceiling(DOD)>yr+1 & yr-floor(TIME_TR)>=0)]
cat(paste('\navailable non-sampled HIV+ individuals in year=', length(tmp)))
tmp <- df.inds[, which(is.na(TIME_SEQ) & floor(DOB)<=yr & ceiling(DOD)>yr+1 & yr-floor(T1_SEQ)==0)]
cat(paste('\navailable non-sampled HIV+ individuals with suggested sampling date in year=', length(tmp)))
stopifnot(length(tmp)>0)
tmp <- sample(tmp, subset( df.sample, YR==yr )[, s.nTOTAL])
set( df.inds, tmp, 'TIME_SEQ', df.inds[tmp, T1_SEQ] )
}
stopifnot( df.inds[, !any(TIME_SEQ>DOD, na.rm=TRUE)] )
stopifnot( df.inds[, !any(TIME_SEQ<TIME_TR, na.rm=TRUE)] )
cat(paste('\n total number of HIV+ in df.inds=', nrow(subset(df.inds, !is.na(TIME_TR)))))
cat(paste('\n total number of sampled HIV+ in df.inds=', nrow(subset(df.inds, !is.na(TIME_TR) & !is.na(TIME_SEQ)))))
list(df.inds=df.inds, df.sample=df.sample)
}
##--------------------------------------------------------------------------------------------------------
# sample a fixed proportion after the start of the intervention (2015)
# after 2015, a fixed amount per year
# olli originally written 27-01-2015
##--------------------------------------------------------------------------------------------------------
PANGEA.Seqsampler.sample.fixed.to.prop<- function(df.ind, df.epi, pipeline.args, sp, verbose=1)
{
# SAMPLING PROBABILITIES and TOTALS PER YEAR
s.total <- round( df.epi[nrow(df.epi), PREV_EVER] * sp )
s.archival.yr <- subset(df.epi, DIAG==0)[, tail(YR,1)]
s.b4intervention.n <- round(s.total*(1-pipeline.args['s.INTERVENTION.prop',][, as.numeric(v)]))
stopifnot(s.b4intervention.n>=pipeline.args['s.ARCHIVAL.n',][, as.numeric(v)])
if(verbose)
{
cat(paste('\nSample a fixed proportion after the start of the intervention (2015), and of those a fixed amount per year'))
cat(paste('\nSampling sequences, target is n=', s.total))
cat(paste('\nNo diagnoses up to yr, sampling archival sequences till then. yr=', s.archival.yr))
cat(paste('\nSampling archival sequences before any diagnoses, n=', pipeline.args['s.ARCHIVAL.n',][, as.numeric(v)]))
cat(paste('\nSampling sequences before intervention start from diagnosed, n=', s.b4intervention.n))
cat(paste('\nSampling sequences after intervention start from diagnosed, n=', s.total - s.b4intervention.n))
}
# setup number of sequences to be sampled for each year
df.sample <- subset( df.epi, YR>= pipeline.args['yr.start',][, as.numeric(v)] & YR<pipeline.args['yr.end',][, as.numeric(v)] )
set(df.sample, NULL, 's.nTOTAL', 0)
tmp <- df.sample[, which(YR<=s.archival.yr)]
set(df.sample, tmp, 's.nTOTAL', as.numeric(rmultinom(1, pipeline.args['s.ARCHIVAL.n',][, as.numeric(v)], rep(1/length(tmp), length(tmp)))))
if(verbose)
cat(paste('\nSampling sequences from archival, scheduled number is n=', df.sample[tmp, sum(s.nTOTAL)]))
tmp <- df.sample[, which( YR>s.archival.yr & YR<pipeline.args['s.INTERVENTION.start',][, as.numeric(v)] )]
set(df.sample, tmp, 's.nTOTAL', as.numeric(rmultinom(1, s.b4intervention.n-pipeline.args['s.ARCHIVAL.n',][, as.numeric(v)], df.sample[tmp, NEW_DIAG]/df.sample[tmp, sum(NEW_DIAG)])) )
if(verbose)
cat(paste('\nSampling sequences before intervention, scheduled number is n=', df.sample[tmp, sum(s.nTOTAL)]))
tmp <- df.sample[, which(YR>=pipeline.args['s.INTERVENTION.start',][, as.numeric(v)]) ]
set(df.sample, tmp, 's.nTOTAL', as.numeric(rmultinom(1, s.total - s.b4intervention.n, rep(1/length(tmp),length(tmp))) ) )
if(verbose)
cat(paste('\nSampling sequences after intervention, scheduled number is n=', df.sample[tmp, sum(s.nTOTAL)]))
stopifnot(df.sample[, all(s.nTOTAL>=0)])
#stopifnot( abs(sp-df.sample[, sum( s.nTOTAL )] / df.sample[, rev(PREV)[1]])<=sp*0.1 )
#
# SAMPLE INFECTED INDIVIDUALS BASED ON NUMBERS PER YEAR
#
# sample non-incident cases by year
df.inds <- copy(df.ind)
df.inds[, TIME_SEQ:= NA_real_]
for(yr in df.sample[, YR]) #TODO took out [-1] because there are s.n.notINC for DSPS in 1980
{
# of all infected and not incident and not yet sampled, sample
if(verbose)
cat(paste('\nadd samples in year',yr,', required=', subset( df.sample, YR==yr )[, s.nTOTAL]))
tmp <- df.inds[, which(is.na(TIME_SEQ) & floor(DOB)<=yr & ceiling(DOD)>yr+1 & yr-floor(TIME_TR)>=0)]
if(verbose)
cat(paste('\navailable non-sampled HIV+ individuals in year=', length(tmp)))
tmp <- df.inds[, which(is.na(TIME_SEQ) & floor(DOB)<=yr & ceiling(DOD)>yr+1 & yr-floor(T1_SEQ)==0)]
if(verbose)
cat(paste('\navailable non-sampled HIV+ individuals with suggested sampling date in year=', length(tmp)))
k <- 1
while(length(tmp)<subset( df.sample, YR==yr )[, s.nTOTAL])
{
if(verbose)
cat(paste('\nCannot find samples, fall back to ',k,'th previous year; n=', subset( df.sample, YR==yr )[, s.nTOTAL]-length(tmp) ))
tmp2 <- df.inds[, which(is.na(TIME_SEQ) & floor(DOB)<=yr & ceiling(DOD)>yr+1 & yr-floor(T1_SEQ)==k & (is.na(ART1_T) | ART1_T-yr<=1)) ]
if(verbose)
cat(paste('\navailable non-sampled HIV+ individuals with suggested sampling date one year before=', length(tmp2)))
tmp2 <- sample(tmp2, min(subset( df.sample, YR==yr )[, s.nTOTAL]-length(tmp), length(tmp2)) )
tmp <- c(tmp, tmp2)
k <- k+1
stopifnot(k<=10)
}
stopifnot(length(tmp)>=subset( df.sample, YR==yr )[, s.nTOTAL])
tmp <- sample(tmp, subset( df.sample, YR==yr )[, s.nTOTAL])
set( df.inds, tmp, 'TIME_SEQ', df.inds[tmp, T1_SEQ+yr-floor(T1_SEQ)] )
}
stopifnot( df.inds[, !any(TIME_SEQ>DOD, na.rm=TRUE)] )
stopifnot( df.inds[, !any(TIME_SEQ<TIME_TR, na.rm=TRUE)] )
if(verbose)
{
cat(paste('\n total number of HIV+ in df.inds=', nrow(subset(df.inds, !is.na(TIME_TR)))))
cat(paste('\n total number of sampled HIV+ in df.inds=', nrow(subset(df.inds, !is.na(TIME_TR) & !is.na(TIME_SEQ)))))
}
list(df.inds=df.inds, df.sample=df.sample)
}
##--------------------------------------------------------------------------------------------------------
# sample proportional to diagnoses before and after interventions
# s% of those newly diagnosed per year until 2015
# from before any diagnosed: 50 (uniform)
# after 2015, a fixed amount per year
# olli originally written 27-01-2015
##--------------------------------------------------------------------------------------------------------
PANGEA.Seqsampler.sample.prop.to.diagnosis.b4intervention<- function(df.ind, df.epi, pipeline.args, sp, verbose=1)
{
# SAMPLING PROBABILITIES and TOTALS PER YEAR
s.total <- round( df.epi[nrow(df.epi), PREV_EVER] * sp )
s.archival.yr <- subset(df.epi, DIAG==0)[, tail(YR,1)]
s.diagb4intervention.n <- subset( df.epi, YR<pipeline.args['s.INTERVENTION.start',][, as.numeric(v)] )[, tail(DIAG,1)]
s.diagb4intervention <- (s.total-pipeline.args['s.ARCHIVAL.n',][, as.numeric(v)]) / (s.diagb4intervention.n+pipeline.args['s.INTERVENTION.mul'][, as.numeric(v)]*(df.epi[nrow(df.epi), DIAG]-s.diagb4intervention.n))
if(verbose)
{
cat(paste('\nSample proportional to diagnoses up to intervention, and then a fixed amount per year'))
cat(paste('\nSampling sequences, target is n=', s.total))
cat(paste('\nNo diagnoses up to yr, sampling archival sequences till then. yr=', s.archival.yr))
cat(paste('\nSampling archival sequences before any diagnoses, n=', pipeline.args['s.ARCHIVAL.n',][, as.numeric(v)]))
cat(paste('\nNumber of diagnoses before intervention, n=', s.diagb4intervention.n))
cat(paste('\nNumber of diagnoses after intervention start, n=', df.epi[nrow(df.epi), DIAG]-s.diagb4intervention.n))
cat(paste('\nSampling sequences before intervention start from diagnosed, p=', s.diagb4intervention))
cat(paste('\nSampling sequences after intervention start from diagnosed, n=', round(s.diagb4intervention*pipeline.args['s.INTERVENTION.mul'][, as.numeric(v)]*(df.epi[nrow(df.epi), DIAG]-s.diagb4intervention.n))))
}
# setup number of sequences to be sampled for each year
df.sample <- subset( df.epi, YR>= pipeline.args['yr.start',][, as.numeric(v)] & YR<pipeline.args['yr.end',][, as.numeric(v)] )
set(df.sample, NULL, 's.nTOTAL', 0)
tmp <- df.sample[, which(YR<=s.archival.yr)]
set(df.sample, tmp, 's.nTOTAL', as.numeric(rmultinom(1, pipeline.args['s.ARCHIVAL.n',][, as.numeric(v)], rep(1/length(tmp), length(tmp)))))
if(verbose)
cat(paste('\nSampling sequences from archival, scheduled number is n=', df.sample[tmp, sum(s.nTOTAL)]))
tmp <- df.sample[, which( YR>s.archival.yr & YR<pipeline.args['s.INTERVENTION.start',][, as.numeric(v)] )]
set(df.sample, tmp, 's.nTOTAL', as.numeric(rmultinom(1, round( s.diagb4intervention.n*s.diagb4intervention ), df.sample[tmp, NEW_DIAG]/df.sample[tmp, sum(NEW_DIAG)])) )
if(verbose)
cat(paste('\nSampling sequences before intervention, scheduled number is n=', df.sample[tmp, sum(s.nTOTAL)]))
tmp <- df.sample[, which(YR>=pipeline.args['s.INTERVENTION.start',][, as.numeric(v)]) ]
set(df.sample, tmp, 's.nTOTAL', as.numeric(rmultinom(1, round( (df.sample[nrow(df.sample), DIAG]-s.diagb4intervention.n)*pipeline.args['s.INTERVENTION.mul'][, as.numeric(v)]*s.diagb4intervention ), rep(1/length(tmp),length(tmp))) ) )
if(verbose)
cat(paste('\nSampling sequences after intervention, scheduled number is n=', df.sample[tmp, sum(s.nTOTAL)]))
stopifnot(df.sample[, all(s.nTOTAL>=0)])
#stopifnot( abs(sp-df.sample[, sum( s.nTOTAL )] / df.sample[, rev(PREV)[1]])<=sp*0.1 )
#
# SAMPLE INFECTED INDIVIDUALS BASED ON NUMBERS PER YEAR
#
# sample non-incident cases by year
df.inds <- copy(df.ind)
df.inds[, TIME_SEQ:= NA_real_]
for(yr in df.sample[, YR]) #TODO took out [-1] because there are s.n.notINC for DSPS in 1980
{
# of all infected and not incident and not yet sampled, sample
if(verbose)
cat(paste('\nadd samples in year',yr,', required=', subset( df.sample, YR==yr )[, s.nTOTAL]))
tmp <- df.inds[, which(is.na(TIME_SEQ) & floor(DOB)<=yr & ceiling(DOD)>yr+1 & yr-floor(TIME_TR)>=0)]
if(verbose)
cat(paste('\navailable non-sampled HIV+ individuals in year=', length(tmp)))
tmp <- df.inds[, which(is.na(TIME_SEQ) & floor(DOB)<=yr & ceiling(DOD)>yr+1 & yr-floor(T1_SEQ)==0)]
if(verbose)
cat(paste('\navailable non-sampled HIV+ individuals with suggested sampling date in year=', length(tmp)))
k <- 1
while(length(tmp)<subset( df.sample, YR==yr )[, s.nTOTAL])
{
if(verbose)
cat(paste('\nCannot find samples, fall back to ',k,'th previous year; n=', subset( df.sample, YR==yr )[, s.nTOTAL]-length(tmp) ))
tmp2 <- df.inds[, which(is.na(TIME_SEQ) & floor(DOB)<=yr & ceiling(DOD)>yr+1 & yr-floor(T1_SEQ)==k & (is.na(ART1_T) | ART1_T-yr<=1)) ]
if(verbose)
cat(paste('\navailable non-sampled HIV+ individuals with suggested sampling date one year before=', length(tmp2)))
tmp2 <- sample(tmp2, min(subset( df.sample, YR==yr )[, s.nTOTAL]-length(tmp), length(tmp2)) )
tmp <- c(tmp, tmp2)
k <- k+1
stopifnot(k<=10)
}
stopifnot(length(tmp)>=subset( df.sample, YR==yr )[, s.nTOTAL])
tmp <- sample(tmp, subset( df.sample, YR==yr )[, s.nTOTAL])
set( df.inds, tmp, 'TIME_SEQ', df.inds[tmp, T1_SEQ+yr-floor(T1_SEQ)] )
}
stopifnot( df.inds[, !any(TIME_SEQ>DOD, na.rm=TRUE)] )
stopifnot( df.inds[, !any(TIME_SEQ<TIME_TR, na.rm=TRUE)] )
if(verbose)
{
cat(paste('\n total number of HIV+ in df.inds=', nrow(subset(df.inds, !is.na(TIME_TR)))))
cat(paste('\n total number of sampled HIV+ in df.inds=', nrow(subset(df.inds, !is.na(TIME_TR) & !is.na(TIME_SEQ)))))
}
list(df.inds=df.inds, df.sample=df.sample)
}
##--------------------------------------------------------------------------------------------------------
# return ancestral sequence sampler
# olli originally written 27-01-2015
##--------------------------------------------------------------------------------------------------------
PANGEA.Seqsampler.v4<- function(df.ind, df.trm, pipeline.args, outfile.ind, outfile.trm, with.plot=1)
{
stopifnot(grepl('Fixed2Prop|Prop2DiagB4I|Prop2Diag|Prop2Untreated|Prop2SuggestedSampling',pipeline.args['s.MODEL',][, v]))
# compute prevalence and incidence by year
epi.adult <- 13
suppressWarnings( df.trm[, YR:= df.trm[, floor(TIME_TR)]] )
df.epi <- df.trm[, list(INC=length(IDREC), INC_ACUTE=length(which(TR_ACUTE=='Yes')),IMPORT=length(which(IDTR<0))), by='YR']
tmp <- df.epi[, {
sexactive <- which( floor(df.ind[['DOB']]+epi.adult)<=YR & ceiling(df.ind[['DOD']])>YR )
infected.ever <- which( floor(df.ind[['TIME_TR']])<=YR )
infected <- which( floor(df.ind[['DOB']])<=YR & floor(df.ind[['DOD']])>YR & floor(df.ind[['TIME_TR']])<=YR )
diag <- which( floor(df.ind[['DOB']])<=YR & floor(df.ind[['DOD']])>YR & floor(df.ind[['DIAG_T']])<=YR )
diag.new <- which( floor(df.ind[['DIAG_T']])==YR )
treated <- which( floor(df.ind[['DOB']])<=YR & floor(df.ind[['DOD']])>YR & floor(df.ind[['ART1_T']])<=YR & (is.na(df.ind[['VLS1_TE']]) | floor(df.ind[['VLS1_TE']])>YR) )
infdead <- which( floor(df.ind[['DOD']])==YR & floor(df.ind[['TIME_TR']])<=YR )
suggsampling <- which( floor(df.ind[['T1_SEQ']])==YR & floor(df.ind[['TIME_TR']])>=pipeline.args['yr.start',][, as.numeric(v)])
list(POP=length(sexactive), PREV=length(infected), PREV_EVER=length(infected.ever), PREVDIED=length(infdead), DIAG=length(diag), NEW_DIAG=length(diag.new), TREATED=length(treated), T1_SEQ=length(suggsampling))
},by='YR']
df.epi <- merge( tmp, df.epi, by='YR' )
set(df.epi, NULL, 'PREVp', df.epi[, PREV/(POP-PREV)])
set(df.epi, NULL, 'INCp', df.epi[, INC/(POP-PREV)])
set(df.epi, NULL, 'IMPORTp', df.epi[, IMPORT/INC])
set(df.epi, NULL, 'ACUTEp', df.epi[, INC_ACUTE/INC])
set(df.epi, NULL, 'ARTcov', df.epi[, TREATED/PREV])
set(df.epi, NULL, 'UNDIAGp', df.epi[, (PREV-DIAG)/PREV])
set(df.epi, NULL, 'GROWTHr', c(NA_real_, df.epi[, diff(log(PREV))]))
stopifnot( !is.na(pipeline.args['s.PREV.max.n',][, as.numeric(v)]) | !is.na(pipeline.args['s.PREV.max',][, as.numeric(v)]) )
s.PREV.max.guess <- pipeline.args['s.PREV.max.n',][, as.numeric(v)] / df.epi[nrow(df.epi), PREV_EVER]
if(is.na(s.PREV.max.guess) & pipeline.args['s.MODEL',][, v]=='Prop2DiagB4I')
{
# calibration runs to determine sampling coverage
# set(pipeline.args, pipeline.args[, which(stat=='s.PREV.max')], 'v', '0.08')
s.PREV.max.guess <- seq( pipeline.args['s.PREV.max',][, as.numeric(v)]*0.4, pipeline.args['s.PREV.max',][, as.numeric(v)]*0.6, length.out=40)
tmp <- sapply(s.PREV.max.guess, function(x)
{
cat(paste('\ntry s.PREV.max.guess=',x))
tmp <- PANGEA.Seqsampler.sample.prop.to.diagnosis.b4intervention(df.ind, df.epi, pipeline.args, x, verbose=0)
tmp2 <- subset( tmp$df.inds, DOD>pipeline.args['s.INTERVENTION.start',][, as.numeric(v)] & floor(TIME_TR)<pipeline.args['s.INTERVENTION.start',][, as.numeric(v)])
sc.alive15.infl15 <- tmp2[, length(which(!is.na(TIME_SEQ)))/length(TIME_SEQ) ]
tmp2 <- subset( tmp$df.inds, DOD>pipeline.args['s.INTERVENTION.start',][, as.numeric(v)] & floor(TIME_TR)<pipeline.args['yr.end',][, as.numeric(v)])
sc.alive15.infl20 <- tmp2[, length(which(!is.na(TIME_SEQ)))/length(TIME_SEQ) ]
tmp2 <- subset( tmp$df.inds, DOD>pipeline.args['yr.end',][, as.numeric(v)] & floor(TIME_TR)<pipeline.args['yr.end',][, as.numeric(v)])
sc.alive20.infl20 <- tmp2[, length(which(!is.na(TIME_SEQ)))/length(TIME_SEQ) ]
tmp2 <- subset( tmp$df.inds, floor(TIME_TR)>=2000 & floor(TIME_TR)<pipeline.args['yr.end',][, as.numeric(v)])
sc.infg99l20 <- tmp2[, length(which(!is.na(TIME_SEQ)))/length(TIME_SEQ) ]
tmp2 <- subset( tmp$df.inds, floor(TIME_TR)>=pipeline.args['s.INTERVENTION.start',][, as.numeric(v)] & floor(TIME_TR)<pipeline.args['yr.end',][, as.numeric(v)])
sc.infg14l20 <- tmp2[, length(which(!is.na(TIME_SEQ)))/length(TIME_SEQ) ]
sn.g14 <- subset( tmp$df.inds, TIME_SEQ>=pipeline.args['s.INTERVENTION.start',][, as.numeric(v)] & floor(TIME_TR)<pipeline.args['yr.end',][, as.numeric(v)])[, length(which(!is.na(TIME_SEQ))) ]
sn.total <- subset(tmp$df.inds, floor(TIME_TR)<pipeline.args['yr.end',][, as.numeric(v)])[, length(which(!is.na(TIME_SEQ))) ]
sp.g14 <- sn.g14 / sn.total
c(sc.alive15.infl15=sc.alive15.infl15, sc.alive15.infl20=sc.alive15.infl20, sc.alive20.infl20=sc.alive20.infl20, sc.infg99l20=sc.infg99l20, sc.infg14l20=sc.infg14l20, sn.g14=sn.g14, sn.total=sn.total, sp.g14=sp.g14)
})
tmp <- as.data.table(t(tmp))
tmp[, s.PREV.max.guess:=s.PREV.max.guess]
# use best guess
s.PREV.max.guess <- tmp[which.min(abs(sc.alive20.infl20-pipeline.args['s.PREV.max',][, as.numeric(v)])), ][, s.PREV.max.guess]
}
if(!is.na(s.PREV.max.guess))
cat(paste('\nFound best s.PREV.max.guess=',s.PREV.max.guess))
if(pipeline.args['s.MODEL',][, v]=='Prop2SuggestedSampling')
tmp <- PANGEA.Seqsampler.sample.prop.to.T1SEQ(df.ind, df.epi, pipeline.args, verbose=1)
if(pipeline.args['s.MODEL',][, v]=='Fixed2Prop')
tmp <- PANGEA.Seqsampler.sample.fixed.to.prop(df.ind, df.epi, pipeline.args, s.PREV.max.guess, verbose=1)
if(pipeline.args['s.MODEL',][, v]=='Prop2DiagB4I')
tmp <- PANGEA.Seqsampler.sample.prop.to.diagnosis.b4intervention(df.ind, df.epi, pipeline.args, s.PREV.max.guess, verbose=1)
if(pipeline.args['s.MODEL',][, v]=='Prop2Diag')
tmp <- PANGEA.Seqsampler.sample.prop.to.diagnosis(df.ind, df.epi, pipeline.args, s.PREV.max.guess, verbose=1)
if(pipeline.args['s.MODEL',][, v]=='Prop2Untreated')
tmp <- PANGEA.Seqsampler.sample.prop.to.untreated(df.ind, df.epi, pipeline.args, s.PREV.max.guess, verbose=1)
df.inds <- copy(tmp$df.inds)
df.sample <- copy(tmp$df.sample)
#
tmp2 <- df.epi[, {
tmp2 <- df.inds[ which( df.inds$DOD>YR & floor(df.inds$TIME_TR)<YR ),]
list(SCOV= ifelse(nrow(tmp2)==0, 0, tmp2[, length(which(!is.na(TIME_SEQ)))/length(TIME_SEQ) ]))
} , by='YR']
sc.alive20.infl20 <- subset(tmp2, YR==pipeline.args['yr.end',][, as.numeric(v)]-1)[, SCOV]
df.sample <- merge(df.sample, tmp2, by='YR')
tmp2 <- subset( df.inds, floor(TIME_TR)>=2000 & floor(TIME_TR)<pipeline.args['yr.end',][, as.numeric(v)])
sc.infg99l20 <- tmp2[, length(which(!is.na(TIME_SEQ)))/length(TIME_SEQ) ]
sn.g14 <- subset( df.inds, TIME_SEQ>=pipeline.args['s.INTERVENTION.start',][, as.numeric(v)] & floor(TIME_TR)<pipeline.args['yr.end',][, as.numeric(v)])[, length(which(!is.na(TIME_SEQ))) ]
sn.total <- subset(df.inds, floor(TIME_TR)<pipeline.args['yr.end',][, as.numeric(v)])[, length(which(!is.na(TIME_SEQ))) ]
cat(paste('\ncoverage alive20.infl20=', sc.alive20.infl20))
cat(paste('\ncoverage infg99l20=', sc.infg99l20))
cat(paste('\nseqs sn.g14=', sn.g14))
cat(paste('\nseqs all=', sn.total))
# set sampling in df.trm
tmp <- subset( df.inds, !is.na(TIME_SEQ), select=c(IDPOP, TIME_SEQ) )
setnames(tmp, c('IDPOP','TIME_SEQ'), c('IDREC','SAMPLED_REC'))
df.trms <- merge(df.trm, tmp, by='IDREC', all.x=TRUE)
setnames(tmp, c('IDREC','SAMPLED_REC'), c('IDTR','SAMPLED_TR'))
df.trms <- merge(df.trms, tmp, by='IDTR', all.x=TRUE)
#
# seroprevalence survey
#
df.sp <- PANGEA.SeroPrevalenceSurvey(df.inds, epi.adult=epi.adult, s.INTERVENTION.start=pipeline.args['s.INTERVENTION.start', ][, as.numeric(v)], sp.prop.of.sexactive=pipeline.args['sp.prop.of.sexactive', ][, as.numeric(v)], sp.times=c(12, 8, 4, 0) )
file <- gsub('IND.csv','CROSS_SECTIONAL_SURVEY.csv', outfile.ind)
cat(paste('\nwrite to file', file))
write.csv(df.sp, file=file)
#
# TRANSMISSION NETWORKS
#
# cluster with index case
tmp <- subset(df.trms, select=c(IDTR, IDREC))
tmp <- graph.data.frame(tmp, directed=TRUE, vertices=NULL)
tmp <- data.table(IDPOP=as.integer(V(tmp)$name), CLU=clusters(tmp, mode="weak")$membership)
tmp2 <- tmp[, list(CLU_SIZE=length(IDPOP)), by='CLU']
setkey(tmp2, CLU_SIZE)
tmp2[, IDCLU:=rev(seq_len(nrow(tmp2)))]
tmp <- subset( merge(tmp, tmp2, by='CLU'), select=c(IDPOP, IDCLU) )
setnames(tmp, 'IDPOP', 'IDREC')
df.trms <- merge( df.trms, tmp, by='IDREC', all.x=TRUE )
stopifnot( nrow(subset(df.trms, is.na(IDCLU)))==0 )
cat(paste('\nFound transmission clusters, n=', df.trms[, length(unique(IDCLU))]))
# add IDCLU to df.inds
tmp <- subset( df.trms, select=c(IDREC, IDTR, IDCLU) )
tmp <- subset( melt(tmp, id.var='IDCLU', value.name='IDPOP'), select=c(IDPOP, IDCLU))
setkey(tmp, IDPOP, IDCLU)
tmp <- unique(tmp, by=c('IDPOP','IDCLU'))
df.inds <- merge( df.inds, tmp, by='IDPOP', all.x=TRUE )
#
# PLOTS
#
if(with.plot)
{
# plot numbers sampled, prevalent, incident
tmp <- names(df.sample)[ which(as.character(sapply(df.sample, class))!='numeric') ]
for(x in tmp)
set(df.sample, NULL, x, as.numeric(df.sample[[x]]))
tmp <- melt(df.sample, id.vars=c('YR'))
set(tmp, NULL, 'variable', tmp[, factor(variable, levels=tmp[, unique(variable)], labels=tmp[, unique(variable)])])
ggplot(tmp, aes(x=YR, y=value, group=variable)) + geom_step() +
facet_grid(variable~., scales='free_y') +
theme_bw() + scale_x_continuous(name='year', breaks=seq(1980,pipeline.args['yr.end',][, as.numeric(v)],2)) +
theme(strip.text=element_text(size=7))
file<- paste(substr(outfile.ind, 1, nchar(outfile.ind)-7),'INFO_Totals.pdf',sep='')
cat(paste('\nPlotting to file',file))
ggsave(file=file, w=16, h=16)
# plot CD4 counts at time of diagnosis
tmp <- subset(df.inds, !is.na(DIAG_T), select=c(DIAG_T, DIAG_CD4))
tmp[, YR:= floor(DIAG_T)]
tmp <- merge(tmp, tmp[, list(DIAG_CD4_ME=mean(DIAG_CD4)), by='YR'], by='YR')
setkey(tmp, YR)
ggplot(tmp) + geom_point(aes(x=DIAG_T, y=DIAG_CD4)) + geom_step(data=unique(tmp, by='YR'), aes(x=YR, y=DIAG_CD4_ME), col='red', size=1.5) + scale_y_continuous(breaks=seq(100,1000,100))
file<- paste(substr(outfile.ind, 1, nchar(outfile.ind)-7),'INFO_CD4.pdf',sep='')
cat(paste('\nPlotting to file',file))
ggsave(file=file, w=8, h=8)
ggplot(subset(df.inds, !is.na(DIAG_T)), aes(x=floor(DIAG_T), fill=cut(DIAG_CD4, breaks=c(0,200,350,500,700,2000)))) + geom_bar(binwidth=1, position='fill') +
labs(fill='CD4 category', y='percent of diagnosed', x='year diagnosed') + scale_y_continuous(breaks=seq(0,1,0.1)) + scale_fill_brewer(palette='Set1')
file<- paste(substr(outfile.ind, 1, nchar(outfile.ind)-7),'INFO_CD4PROP_AMONGDIAG.pdf',sep='')
cat(paste('\nPlotting to file',file))
ggsave(file=file, w=8, h=8)
ggplot(subset(df.inds, !is.na(DIAG_T) & !is.na(TIME_SEQ)), aes(x=floor(DIAG_T), fill=cut(DIAG_CD4, breaks=c(0,200,350,500,700,2000)))) + geom_bar(binwidth=1, position='fill') +
labs(fill='CD4 category', y='percent of sequenced', x='year diagnosed') + scale_y_continuous(breaks=seq(0,1,0.1)) + scale_fill_brewer(palette='Set1')
file<- paste(substr(outfile.ind, 1, nchar(outfile.ind)-7),'INFO_CD4PROP_AMONGSEQ.pdf',sep='')
cat(paste('\nPlotting to file',file))
ggsave(file=file, w=8, h=8)
# plot distribution between transmission time and sequencing time
ggplot(subset(df.inds, !is.na(TIME_SEQ)), aes(x=TIME_SEQ-TIME_TR)) + geom_histogram(binwidth=1) +
scale_x_continuous(name='time from transmission to sequence sampling\n(years)', breaks=seq(0,100,2))
file<- paste(substr(outfile.ind, 1, nchar(outfile.ind)-7),'INFO_Time2Seq.pdf',sep='')
ggsave(file=file, w=8, h=8)
# plot distribution between transmission time and diagnosis
ggplot(subset(df.inds, !is.na(DIAG_T)), aes(x=DIAG_T-TIME_TR)) + geom_histogram(binwidth=1) +
scale_x_continuous(name='time from transmission to diagnosis\n(years)', breaks=seq(0,100,2))
file<- paste(substr(outfile.ind, 1, nchar(outfile.ind)-7),'INFO_Time2Diag.pdf',sep='')
ggsave(file=file, w=8, h=8)
# plot distribution of #recipients per infector
tmp <- df.trms[, list(N= length(IDREC)), by='IDTR']
ggplot(tmp, aes(x=N)) + geom_histogram(binwidth=1, aes(y= ..density..)) +
scale_x_continuous(name='recipients per source case\n(number)', breaks=seq(1,100,1)+0.5, label=seq(1,100,1)) +
scale_y_continuous(breaks=seq(0,1,0.05))
file<- paste(substr(outfile.ind, 1, nchar(outfile.ind)-7),'INFO_RecSource.pdf',sep='')
ggsave(file=file, w=8, h=8)
# plot time to death for infected
tmp <- subset(df.inds, !is.na(TIME_TR) & IDPOP>0 & DOD<max(DOD, na.rm=1), select=c(TIME_TR, DOD))
ggplot(tmp, aes(x=DOD-TIME_TR)) + geom_histogram(binwidth=1, aes(y= ..density..)) +
scale_x_continuous(name='time to death for HIV+\n(years)', breaks=seq(1,100,2))
file<- paste(substr(outfile.ind, 1, nchar(outfile.ind)-7),'INFO_T2DeathForInf.pdf',sep='')
ggsave(file=file, w=8, h=8)
# plot transmission network
if(0)
{
file <- paste(substr(outfile.ind, 1, nchar(outfile.ind)-7),'INFO_TrNetworks.pdf',sep='')
cat(paste('\nPlotting to file',file))
pdf(file=file, w=20, h=20)
dummy <- sapply( df.inds[, sort(na.omit(unique(IDCLU)))], function(clu)
{
cat(paste('\nprocess cluster no',clu))
tmp <- subset(df.inds, IDCLU==clu & IDPOP>=0, select=c(IDPOP, GENDER, TIME_SEQ))
tmp[, IS_SEQ:= tmp[, factor(!is.na(TIME_SEQ), label=c('N','Y'), levels=c(FALSE, TRUE))]]
clu.igr <- graph.data.frame(subset(df.trms, IDCLU==clu & IDTR>=0, select=c(IDTR, IDREC)), directed=TRUE, vertices=subset(tmp, select=c(IDPOP, GENDER, IS_SEQ)))
V(clu.igr)$color <- ifelse( get.vertex.attribute(clu.igr, 'IS_SEQ')=='Y', 'green', 'grey90' )
V(clu.igr)$shape <- ifelse( get.vertex.attribute(clu.igr, 'GENDER')=='M', 'circle', 'square' )
par(mai=c(0,0,1,0))
plot(clu.igr, main=paste('IDCLU=',clu,sep=''), vertex.size=2, vertex.label.cex=0.25, edge.arrow.size=0.5, layout=layout.fruchterman.reingold(clu.igr, niter=1e3) )
legend('bottomright', fill=c('green','grey90'), legend=c('sequence sampled','sequence not sampled'), bty='n')
legend('bottomleft', legend=c('square= Female','circle= Male'), bty='n')
})
dev.off()
}
#ggplot(df.trms, aes(x=IDTR, y=TIME_TR)) + geom_point()
#ggplot(df.trms, aes(x=IDTR, y=IDCLU)) + geom_point()
}
#
# SAVE SAMPLED RECIPIENTS AND TRANSMISSIONS TO SAMPLED RECIPIENTS
#
# save for us
file <- paste(substr(outfile.ind, 1, nchar(outfile.ind)-7),'SAVE.R',sep='')
save(file=file, df.epi, df.trms, df.inds, df.sample, df.sp)
# save for virus tree simulator
# exclude columns that are not needed
df.inds <- subset(df.inds, !is.na(TIME_TR))
if('RISK'%in%colnames(df.inds))
df.inds[, RISK:=NULL]
if('INCIDENT_SEQ'%in%colnames(df.inds))
df.inds[, INCIDENT_SEQ:=NULL]
if('TIME_SEQYR'%in%colnames(df.inds))
df.inds[, TIME_SEQYR:=NULL]
if('TR_ACUTE'%in%colnames(df.trms))
df.trms[, TR_ACUTE:=NULL]
if('YR'%in%colnames(df.trms))
df.trms[, YR:=NULL]
# add columns that the virus tree simulator needs
if(!'IDTR_TIME_INFECTED'%in%colnames(df.trms))
{
tmp <- subset( df.inds, !is.na(TIME_TR), c(IDPOP, TIME_TR) )
setnames(tmp, c('IDPOP','TIME_TR'), c('IDTR','IDTR_TIME_INFECTED') )
df.trms <- merge(df.trms, tmp, by='IDTR', all.x=TRUE)
}
cat(paste('\nwrite to file',outfile.ind))
write.csv(file=outfile.ind, df.inds)
cat(paste('\nwrite to file',outfile.trm))
write.csv(file=outfile.trm, df.trms)
}
##--------------------------------------------------------------------------------------------------------
# return ancestral sequence sampler
# olli originally written 20-09-2014
##--------------------------------------------------------------------------------------------------------
#' @title Create starting sequence sampler
#' @description Returns a function and function arguments to draw ancestral sequences.
#' @param root.ctime.grace Sample a starting sequence with time that matches a query times +- this grace
#' @param sample.grace Internal parameter to make sure the requested number of samples is obtained. Internally oversample by this multiplier to the sample size, and then check if sequences are unique.
#' @return list of the sampler \code{rANCSEQ} and its arguments \code{rANCSEQ.args}
PANGEA.RootSeq.create.sampler<- function(root.ctime.grace= 0.5, sample.grace= 3)
{
file <- system.file(package="PANGEA.HIV.sim", "misc",'PANGEA_SSAfgBwhRc-_140907_n390_AncSeq.R')
cat(paste('\nLoading starting sequences from file', file))
load(file) #expect "anc.seq.gag" "anc.seq.pol" "anc.seq.env" "anc.seq.info"
setkey(anc.seq.info, CALENDAR_TIME)
rANCSEQ.args <- list( root.ctime.grace=root.ctime.grace, sample.grace=sample.grace, anc.seq.info=anc.seq.info, anc.seq.gag=anc.seq.gag, anc.seq.pol=anc.seq.pol, anc.seq.env=anc.seq.env)
rANCSEQ<- function(root.ctime, rANCSEQ.args)
{
tmp <- lapply(seq_along(root.ctime), function(i)
{
tmp <- subset(rANCSEQ.args$anc.seq.info, CALENDAR_TIME>root.ctime[i]-rANCSEQ.args$root.ctime.grace & CALENDAR_TIME<=root.ctime[i]+rANCSEQ.args$root.ctime.grace )
if(nrow(tmp)<rANCSEQ.args$sample.grace*100)
{
warning( paste('\nFor root',i,': number of samples is n=',nrow(tmp),'. safe pool size is n=',rANCSEQ.args$sample.grace*100) )
}
data.table( LABEL= tmp[, sample( LABEL, rANCSEQ.args$sample.grace ) ], CALENDAR_TIME=root.ctime[i], DRAW=i )
})
tmp <- do.call('rbind',tmp)
# get unique seqs
setkey(tmp, LABEL)
tmp <- unique(tmp, by='LABEL')
anc.seq.draw <- do.call( 'cbind', list( rANCSEQ.args$anc.seq.gag[tmp[, LABEL], ], rANCSEQ.args$anc.seq.pol[tmp[, LABEL], ], rANCSEQ.args$anc.seq.env[tmp[, LABEL], ] ) )
anc.seq.draw <- seq.unique(anc.seq.draw)
# check if at least one seq for each draw
tmp <- merge( data.table(LABEL=rownames(anc.seq.draw)), tmp, by='LABEL' )
stopifnot( !length(setdiff( seq_along(root.ctime), tmp[, unique(DRAW)] )) )
# take first seq for each draw
tmp <- tmp[, list(LABEL= LABEL[1]), by='DRAW']
setkey(tmp, DRAW)
anc.seq.draw <- anc.seq.draw[ tmp[, LABEL], ]
anc.seq.draw
}
list(rANCSEQ=rANCSEQ, rANCSEQ.args=rANCSEQ.args)
}
##--------------------------------------------------------------------------------------------------------
# return evolutionary rate sampler for transmission edges
# olli originally written 16-09-2014
##--------------------------------------------------------------------------------------------------------
#' @title Create sampler of Evolutionary Rates for transmission edges
#' @description Returns a function to draw evolutionary rates for transmission edges. Currently modelled with a log normal density.
#' @param er.shift shift to mean of the log normal density
#' @return R function
PANGEA.TransmissionEdgeEvolutionaryRate.create.sampler<- function(er.meanlog, er.sdlog)
{
if(0)
{
x <- seq(0.0005, 0.003, 0.00001)
tmp <- data.table(x=x, y1=dLOGNO(x, mu=-6.714086, sigma=0.15), y2=dLOGNO(x, mu=-6.714086-0.15^2/2+0.075^2/2, sigma=0.075), y4=dLOGNO(x, mu=-6.714086-0.15^2/2+0.0375^2/2, sigma=0.0375))
tmp <- melt(tmp, id.var='x')
ggplot(tmp, aes(x=x, y=value, group=variable, colour=variable)) + geom_line() + scale_x_continuous(breaks=seq(0,0.003,0.0005))
x <- seq(0.0005, 0.003, 0.00001)
tmp <- data.table(x=x, y1=dLOGNO(x, mu=log(0.002239075)-0.05^2/2, sigma=0.05), y5=dLOGNO(x, mu=log(0.002239075-0.0005)-0.065^2/2, sigma=0.065), y10=dLOGNO(x, mu=log(0.002239075-0.001)-0.09^2/2, sigma=0.09))
x <- seq(0.0005, 0.01, 0.00001)
tmp <- data.table( x=x,
y5=dLOGNO(x, mu=log(0.002239075)-0.05^2/2, sigma=0.05), y7=dLOGNO(x, mu=log(0.002239075)-0.07^2/2, sigma=0.07), y10=dLOGNO(x, mu=log(0.002239075)-0.1^2/2, sigma=0.1), y13=dLOGNO(x, mu=log(0.002239075)-0.13^2/2, sigma=0.13),
W441=dLOGNO(x, mu=log(0.00447743)-0.3^2/2, sigma=0.3), W442=dLOGNO(x, mu=log(0.00447743)-0.4^2/2, sigma=0.4), W443=dLOGNO(x, mu=log(0.00447743)-0.5^2/2, sigma=0.5))
# times 2
x <- seq(0.0005, 0.01, 0.00001)
tmp <- data.table( x=x,
#TransmissionLineage2=dLOGNO(x, mu=log(0.002239075)-0.13^2/2, sigma=0.13),
TransmissionLineage=dLOGNO(x, mu=log(0.002239075)-0.3^2/2, sigma=0.3),
#TransmissionLineage3=dLOGNO(x, mu=log(0.002239075)-0.2^2/2, sigma=0.2),
TipBranch=dLOGNO(x, mu=log(0.00447743)-0.5^2/2, sigma=0.5)
)
# times 1.5
#tmp <- data.table( x=x,
# TransmissionLineage=dLOGNO(x, mu=log(0.002239075)-0.13^2/2, sigma=0.13),
# TipBranch=dLOGNO(x, mu=log(0.003358613)-0.3^2/2, sigma=0.3))
#tmp <- data.table( x=x,
# TransmissionLineage=dLOGNO(x, mu=log(0.002239075)-0.01^2/2, sigma=0.01),
# TipBranch=dLOGNO(x, mu=log(0.003)-0.2^2/2, sigma=0.2))
tmp <- melt(tmp, id.var='x')
set(tmp, NULL, 'variable',tmp[,factor(variable,levels=c('TransmissionLineage','TipBranch'),labels=c('Transmission lineage','Non-transmission lineage'))])
ggplot(tmp, aes(x=x, y=value, group=variable, colour=variable)) + geom_line() +
scale_x_continuous(breaks=seq(0,0.02,0.001)) +
theme(legend.position='bottom') +
labs(x='\nevolutionary rate',y='',colour='')
ggsave(file='~/git/PANGEA.HIV.sim/man/fig_ERmodel.png', width=8, height=5)
}
if(0)
{
file <- system.file(package="PANGEA.HIV.sim", "misc",'PANGEA_SSAfgBwhRc-_140907_n390_BEASTlog.R')
cat(paste('\nreading GTR parameters from file',file))
load(file) # expect log.df
# exclude odd BEAST runs
log.df <- subset(log.df, !(GENE=='ENV' & FILE=='pool3'))
# exclude cols
log.df[, ucld.mean:=NULL]
log.df[, ucld.stdev:=NULL]
log.df[, coefficientOfVariation:=NULL]
log.df[, treeModel.rootHeight:=NULL]
# get posterior distribution of overall meanRate across genes
tmp <- log.df[, list(meanRate=mean(meanRate)), by='GENE'][, mean(meanRate)]
log.df <- log.df[, list(meanRate=meanRate/mean(meanRate)*tmp), by='GENE']
#ggplot(log.df, aes(x=meanRate)) + geom_histogram() + facet_grid(.~GENE, scales='free')
# get lognormal density parameters
sd.log <- log.df[, sd(log(meanRate))] # 0.1499262
mean.log<- log.df[, mean(log(meanRate))] # -6.113092; 0.002239075
# mean is exp( mean.log+sd.log*sd.log/2 ); median is exp(mean.log)
}
rERbw.args <- list(meanlog=er.meanlog, sdlog=er.sdlog)
if(er.sdlog>0)
{
rERbw <- function(n, rERbw.args)
{
rlnorm(n, meanlog=rERbw.args$meanlog, sdlog=rERbw.args$sdlog)
}
}
if(er.sdlog==0)
{
rERbw <- function(n, rERbw.args)
{
rep( exp(rERbw.args$meanlog), n)
}
}
list(rERbw=rERbw, rERbw.args=rERbw.args)
}
##--------------------------------------------------------------------------------------------------------
# Function to add gaps into sequences
# olli originally written 23-06-2015
##--------------------------------------------------------------------------------------------------------
PANGEA.align.from.basefreq<- function(indir.refalgn, infile.refalgn, indir.basefreq, callconsensus.minc, outdir, verbose=1)
{
callconsensus.cmd <- 'python /Users/Oliver/git/SeqAnalTools/CallConsensus_splitbywhitespace.py'
#callconsensus.cmd <- 'python ~/Downloads/CallConsensus.py'
callconsensus.maxc <- callconsensus.minc
mergealignments.cmd <- 'python /Users/Oliver/git/SeqAnalTools/MergeAlignments2.0.py'
mergealignments.opt <- '-d'
# create consensus at coverage cut off
files <- data.table(FILE=list.files(indir.basefreq, pattern='txt$'))
invisible(files[,{
tmp <- gsub(' ','\\ ',gsub(')','\\)',gsub('(','\\(',paste(indir.basefreq, '/', FILE, sep=''),fixed=1),fixed=1),fixed=1)
cmd <- paste(callconsensus.cmd,' ',tmp,' ', callconsensus.minc, ' ', callconsensus.maxc, sep='')
#cat(cmd)
ans <- system(cmd, intern=TRUE)
ans <- gsub('.BaseFreqs.ic','',gsub('ContigsFlattenedWith_','',ans, fixed=TRUE), fixed=TRUE)
file <- paste('TMP_',gsub('.BaseFreqs.ic.txt',paste('_consensus_C',callconsensus.minc,'.fa',sep=''), FILE),sep='')
cat(paste(ans, collapse='\n'), file=paste(outdir, '/', file, sep=''))
NULL
}, by='FILE'])
# add to reference alignment
files <- data.table(FILE=list.files(outdir, pattern=paste('consensus_C',callconsensus.minc,'.fa',sep='')))
ps <- files[,{
cmd <- paste(mergealignments.cmd,' ',mergealignments.opt,' "',indir.refalgn,'/',infile.refalgn,'" ',outdir,'/',FILE,sep='')
#cat(cmd)
ans <- system(cmd, intern=TRUE)
list(SEQ=paste(ans[-1], collapse=''))
}, by='FILE']
# convert consensus alignment to DNAbin
set(ps, NULL, 'FILE', ps[, gsub('TMP_','',FILE)])
set(ps, NULL, 'FILE', ps[, gsub('_consensus.*','',FILE)])
tmp <- tolower(do.call('rbind',strsplit(ps[, SEQ],'')))
rownames(tmp) <- ps[, FILE]
ps <- as.DNAbin(tmp)
# clean up
tmp <- list.files(outdir, pattern='^TMP_', full.names=TRUE)
invisible(file.remove(tmp))
ps
}
##--------------------------------------------------------------------------------------------------------
# Function to add gaps into sequences
# olli originally written 23-06-2015
##--------------------------------------------------------------------------------------------------------
PANGEA.add.gaps<- function(indir.simu, indir.gap, infile.simu, infile.gap, gap.country, gap.symbol, gap.seed, verbose=1)
{
# find files
files <- data.table(FILE=list.files(indir.simu, pattern='\\.fa.*$'))
files <- subset(files, !grepl('^TMP',FILE) & grepl(infile.simu, FILE, fixed=1))
if(files[, any(grepl('gag', FILE))])
{
files[, SIMU:=files[, regmatches(FILE,regexpr('TRAIN[0-9]', FILE))]]
files[, GENE:=files[, sapply(strsplit(FILE,'_'), '[[', 5)]]
set(files, NULL, 'GENE', toupper(files[, substr(GENE,1,nchar(GENE)-3)]))
set(files, NULL, 'GENE', files[, factor(GENE, levels=c('GAG','POL','ENV'), labels=c('GAG','POL','ENV'))])
setkey(files, SIMU, GENE)
files <- subset(files, grepl(infile.simu, FILE, fixed=1) & !is.na(GENE))
# concatenate simu seqs
files[, {
gag <- read.dna( paste(indir.simu, FILE[1], sep='/'), format='fasta' )
pol <- read.dna( paste(indir.simu, FILE[2], sep='/'), format='fasta' )
env <- read.dna( paste(indir.simu, FILE[3], sep='/'), format='fasta' )
tmp <- cbind(gag, pol, env)
write.dna( tmp, file=paste(indir.simu, '/', 'TMP', gsub('gag','conc',FILE[1]), sep=''), format='fasta', colsep='', nbcol=-1)
NULL
}, by='SIMU']
infile.simu <- paste('TMP', gsub('gag','conc',files[1,FILE]), sep='')
}
else
infile.simu <- files[1,FILE]
#
# add to fixed PANGEA alignment
#
ps <- read.dna(paste(indir.gap,'/',infile.gap,sep=''), format='fasta')
tmp <- ps[(nrow(ps)-20):nrow(ps),]
tmp <- as.character(tmp)
tmp[tmp=='?'] <- 'n' #need this for MAFFT
tmp <- as.DNAbin(tmp)
infile.old <- paste('TMP',gsub('\\.','_last20\\.',infile.gap),sep='')
write.dna(tmp, file=paste(indir.simu, infile.old, sep='/'), format='fasta', colsep='', nbcol=-1)
infile.old <- paste(indir.simu, infile.old, sep='/')
# mafft --thread 1 --treeout --reorder --keeplength --mapout --ep 0.0 --add new_sequences input > output
infile.new <- paste(indir.simu,'/',infile.simu, sep='')
file <- paste(indir.simu,'/','TMP', gsub('\\.','mergedpartial\\.',infile.simu), sep='')
cmd <- paste('mafft --thread 4 --treeout --reorder --keeplength --mapout --ep 0.0 --add ',infile.new,' ',infile.old,' > ',file, sep='')
cat('\ncalling')
cat(cmd)
system(cmd)
# remove missing taxa from PANGEA alignment
ps <- as.character(ps)
tmp <- apply(ps, 1, function(x) !all(x%in%c('-','?','n')) )
ps <- as.DNAbin( ps[tmp, ] )
# merge all
file <- list.files(indir.simu, pattern='mergedpartial')
stopifnot(length(file)==1)
tmp <- read.dna(paste(indir.simu, file, sep='/'), format='fasta')
tmp <- tmp[grepl('IDPOP|HOUSE', rownames(tmp)),]
psm <- rbind(ps, tmp)
#
# randomly select gaps from sequences in infile.gap from non-missing sequences
#
set.seed(gap.seed)
x <- as.character(psm)
tmp <- which(grepl(gap.country,rownames(x)))
gp.df <- data.table(IDXSIM=which(grepl('IDPOP|HOUSE',rownames(x))))
gp.df[, IDXGAP:=sample(tmp, nrow(gp.df), replace=TRUE)]
for(i in seq_len(nrow(gp.df)))
{
z <- which(x[ gp.df[i, IDXGAP], ]==gap.symbol)
x[gp.df[i, IDXSIM],z] <- gap.symbol
}
x <- x[ grepl('IDPOP|HOUSE', rownames(x)), ]
# remove leading and trailing gap rows
tmp <- which(apply(x, 2, function(z) !all(z%in%c('-','?','n'))))
sgp <- as.DNAbin(x[, seq.int(tmp[1], tmp[length(tmp)])])
cat(paste('\nSeq length without leading/trailing gaps and with seq coverage (no first/last common "?","-"), n=', ncol(sgp)))
# clean up
tmp <- list.files(indir.simu, pattern='^TMP', full.names=TRUE)
file.remove(tmp)
sgp
}
##--------------------------------------------------------------------------------------------------------
# Function to add gaps into sequences
# olli originally written 01-07-2015
##--------------------------------------------------------------------------------------------------------
PANGEA.add.gaps.merge.and.maintain.triplets<- function(indir.simu, indir.gap, infile.simu, infile.gap, prefix.simulation='TRAIN', verbose=1)
{
# find files
files <- data.table(FILE=list.files(indir.simu, pattern='\\.fa.*$'))
files <- subset(files, !grepl('RMGPS',FILE) & !grepl('map$',FILE) & !grepl('^TMP',FILE) & grepl(infile.simu, FILE, fixed=1))
if(files[, any(grepl('gag', FILE))])
{
files[, SIMU:=files[, regmatches(FILE,regexpr(paste(prefix.simulation,'[0-9]',sep=''), FILE))]]
files[, GENE:=files[, sapply(strsplit(FILE,'_'), '[[', 5)]]
set(files, NULL, 'GENE', toupper(files[, substr(GENE,1,nchar(GENE)-3)]))
set(files, NULL, 'GENE', files[, factor(GENE, levels=c('GAG','POL','ENV'), labels=c('GAG','POL','ENV'))])
setkey(files, SIMU, GENE)
files <- subset(files, grepl(infile.simu, FILE, fixed=1) & !is.na(GENE))
# concatenate simu seqs
files[, {
gag <- read.dna( paste(indir.simu, FILE[1], sep='/'), format='fasta' )
pol <- read.dna( paste(indir.simu, FILE[2], sep='/'), format='fasta' )
env <- read.dna( paste(indir.simu, FILE[3], sep='/'), format='fasta' )
tmp <- cbind(gag, pol, env)
write.dna( tmp, file=paste(indir.simu, '/', 'TMP', gsub('gag','conc',FILE[1]), sep=''), format='fasta', colsep='', nbcol=-1)
NULL
}, by='SIMU']
infile.simu <- paste('TMP', gsub('gag','conc',files[1,FILE]), sep='')
}
else
infile.simu <- files[1,FILE]
#
# add to fixed PANGEA alignment
#
ps <- read.dna(paste(indir.gap,'/',infile.gap,sep=''), format='fasta')
tmp <- ps[(nrow(ps)-20):nrow(ps),]
tmp <- as.character(tmp)
tmp[tmp=='?'] <- 'n' #need this for MAFFT
tmp <- as.DNAbin(tmp)
infile.old <- paste('TMP',gsub('\\.','_last20\\.',infile.gap),sep='')
write.dna(tmp, file=paste(indir.simu, infile.old, sep='/'), format='fasta', colsep='', nbcol=-1)
infile.old <- paste(indir.simu, infile.old, sep='/')
# mafft --thread 1 --treeout --reorder --keeplength --mapout --ep 0.0 --add new_sequences input > output
infile.new <- paste(indir.simu,'/',infile.simu, sep='')
file <- paste(indir.simu,'/','TMP', gsub('\\.','mergedpartial\\.',infile.simu), sep='')
# do not force length: mafft --reorder --anysymbol --add new_sequences --auto input
cmd <- paste('unset MAFFT_BINARIES\nmafft --thread 4 --treeout --reorder --keeplength --mapout --ep 0.0 --add ',infile.new,' ',infile.old,' > ',file, sep='')
cat('\ncalling')
cat(cmd)
system(cmd)
# remove missing taxa from PANGEA alignment
ps <- as.character(ps)
tmp <- apply(ps, 1, function(x) !all(x%in%c('-','?','n')) )
ps <- as.DNAbin( ps[tmp, ] )
# merge all
file <- list.files(indir.simu, pattern='mergedpartial')
stopifnot(length(file)==1)
tmp <- read.dna(paste(indir.simu, file, sep='/'), format='fasta')
tmp <- tmp[grepl('IDPOP|HOUSE', rownames(tmp)),]
psm <- rbind(ps, tmp)
x <- as.character(psm)
# remove leading and trailing gap rows
tmp <- which(apply(x[ grepl('IDPOP|HOUSE', rownames(x)), ], 2, function(z) !all(z%in%c('-','?','n'))))
cat(paste('\nfirst non-gap',tmp[1],'last non-gap',tmp[length(tmp)]))
x <- x[, seq.int(tmp[1], tmp[length(tmp)])]
# rm gap columns that are in the PANGEA alignment and in the simulations
# (should not be in PANGEA alignment)
tmp <- which(apply(x, 2, function(z) all(z%in%c('-','?','n'))))
cat(paste('\nrm common gap columns, n=',length(tmp)))
if(length(tmp))
x <- x[,-tmp]
# check that gaps are only added in triples of 3
# not true. curating alignment manually: can explain all 0110 discrepancies.
# remove columns that break the codon structure
#exp.nontriplegaps<- c(388, 1476)
#tmp <- paste(as.numeric(x[nrow(x),]=='-'),collapse='')
#tmp <- gregexpr('0110',tmp)[[1]] + 1
#cat(paste('\nstarting positions of non-triplet gaps',paste(tmp, collapse=', ')))
#cat(paste('\nexpect starting positions of non-triplet gaps',paste(exp.nontriplegaps, collapse=', ')))
#stopifnot( all(tmp==exp.nontriplegaps) )
#x <- x[,-c(tmp, tmp+1)]
# check that gaps are only added in triples of 3
xx <- paste(as.numeric(x[nrow(x),]=='-'),collapse='')
dfg <- data.table(STRT=gregexpr('01',xx)[[1]] + 1) #gap start positions
tmp <- strsplit(xx,'0')[[1]]
dfg[, LEN:=sapply( tmp[tmp!=""], nchar )]
dfg[, CUT:= LEN%%3]
dfg <- subset(dfg, CUT>0)
if(nrow(dfg))
{
tmp <- unlist(lapply( seq_len(nrow(dfg)), function(i) seq.int(from=dfg[i,STRT], length.out=dfg[i,CUT])))
cat(paste('\npositions of large non-triplet gaps that are removed',paste(tmp, collapse=', ')))
x <- x[,-tmp]
}
x <- as.DNAbin(x)
x
}
PANGEA.add.gaps.simulate<- function(x, gap.country, gap.symbol, gap.seed, prefix.simu='IDPOP|HOUSE', with.hxb2=1, strip.gaps=1, verbose=1)
{
stopifnot(any(class(x)=='DNAbin'))
x <- as.character(x)
#
# randomly select gaps from sequences in infile.gap from non-missing sequences
#
set.seed(gap.seed)
tmp <- which(grepl(gap.country,rownames(x)))
dfg <- data.table(IDXSIM=which(grepl(prefix.simu,rownames(x))))
dfg[, IDXGAP:=sample(tmp, nrow(dfg), replace=TRUE)]
for(i in seq_len(nrow(dfg)))
{
z <- which(x[ dfg[i, IDXGAP], ]==gap.symbol)
x[dfg[i, IDXSIM],z] <- gap.symbol
}
if(!with.hxb2)
x <- x[ grepl(prefix.simu, rownames(x)), ]
if(with.hxb2)
x <- x[ grepl(prefix.simu, rownames(x)) | grepl('HXB2', rownames(x)), ]
if(strip.gaps)
{
tmp <- which(apply(x[!grepl('HXB2', rownames(x)),], 2, function(z) all(z%in%c('-','?','n'))))
cat(paste('\nrm common gap columns, n=',length(tmp)))
if(length(tmp))
x <- x[,-tmp]
}
sgp <- as.DNAbin(x)
sgp
}
##--------------------------------------------------------------------------------------------------------
PANGEA.add.gaps.allocate.chunks.to.sequences<- function(ch, ms)
{
chs <- copy(ch)
chs[, IDX:= sample(nrow(chs))]
setkey(chs, IDX)
dfg <- data.table(IDXSIM= which(grepl('IDPOP|HOUSE',rownames(ms))))
# preallocate: sample gap chunks at random onto simulated sequences
chs[, COPY_TO:= sample(dfg[, IDXSIM], nrow(chs), replace=TRUE)]
# for every new added chunk that could be added, calculate the resulting proportion of gaps in the alignment
chs[, IDX_COARSE:= ceiling(IDX/1e2)]
tmp <- chs[, {
#IDX<- 2000
z <- IDX_COARSE
z <- subset(chs, IDX_COARSE<=z)
setkey(z, COPY_TO, POS)
# merge overlapping gaps
ans <- z[, {
tmp <- which( POS_NEXT[-length(POS_NEXT)]>POS[-1] )[1]
#print(COPY_TO); print(POS_NEXT[-length(POS_NEXT)]); print(POS[-1]); print(tmp)
while(!is.na(tmp))
{
POS_NEXT[tmp] <- apply(rbind(POS_NEXT[tmp],POS_NEXT[tmp+1L]), 2, max)
POS_NEXT[tmp+1L]<- POS_NEXT[tmp]
POS[tmp+1L] <- POS[tmp]
#print(POS); print(POS_NEXT)
POS <- POS[-tmp]
POS_NEXT <- POS_NEXT[-tmp]
tmp <- which( POS_NEXT[-length(POS_NEXT)]>POS[-1] )[1]
}
#print(POS); print(POS_NEXT)
stopifnot(length(POS)==length(POS_NEXT))
list(POS_START=POS, POS_END=POS_NEXT-1L)
}, by='COPY_TO']
ans <- ans[, sum(POS_END-POS_START)] / (nrow(dfg) * ncol(ms))
list(GAP_P=ans)
}, by='IDX_COARSE']
tmp
chs <- merge(chs, tmp, by='IDX_COARSE')
chs
}
##--------------------------------------------------------------------------------------------------------
# return evolutionary rate modifier sampler for transmission edges
# olli originally written 21-08-2014
##--------------------------------------------------------------------------------------------------------
PANGEA.BetweenHostEvolutionaryRateModifier.create.sampler.v1<- function(bwerm.mu=1.5, bwerm.sigma=0.12)
{
if(0)
{
#from Vrancken et al:
#c(3.5/2.5, 7.0/4.2) #1.4, 1.67 draw this from lognormal with mean 1.5 and variance so that tail captures 1.8
x <- seq(1.01, 15, 0.01)
tmp <- data.table(x=x, y5=dLOGNO(x, mu=log(1.5), sigma=0.12), y8=dLOGNO(x, mu=log(1.75), sigma=0.103), y2=dLOGNO(x, mu=log(2), sigma=0.09))
tmp <- data.table(x=x, y4=dLOGNO(x, mu=log(4), sigma=0.045), y3=dLOGNO(x, mu=log(3), sigma=0.06), y2=dLOGNO(x, mu=log(2), sigma=0.09))
tmp <- data.table(x=x, y4=dLOGNO(x, mu=log(4), sigma=0.06), y3=dLOGNO(x, mu=log(3), sigma=0.06), y2=dLOGNO(x, mu=log(2), sigma=0.06))
tmp <- data.table(x=x, y6=dLOGNO(x, mu=log(6)+0.4^2/2, sigma=0.4), y5=dLOGNO(x, mu=log(5)+0.4^2/2, sigma=0.4), y4=dLOGNO(x, mu=log(4)+0.4^2/2, sigma=0.4), y3=dLOGNO(x, mu=log(3)+0.4^2/2, sigma=0.4))
tmp <- melt(tmp, id.var='x')
ggplot(tmp, aes(x=x, y=value, group=variable, colour=variable)) + geom_line() + scale_x_log10(breaks=seq(1,20,1))
}
rER.bwm<- function(n)
{
rLOGNO(n, mu=log(bwerm.mu), sigma=bwerm.sigma)
}
rER.bwm
}
##--------------------------------------------------------------------------------------------------------
# return within host evolutionary rate sampler
# olli originally written 21-08-2014
##--------------------------------------------------------------------------------------------------------
#' @title Create sampler of Within Host Evolutionary Rates
#' @description Returns a function to draw within host evolutionary rates. Currently modelled with a log normal density.
#' @param wher.mu mean of the log normal density
#' @param wher.sigma standard deviation of the log normal density
#' @return R function
PANGEA.WithinHostEvolutionaryRate.create.sampler.v1<- function(wher.mu=log(0.005), wher.sigma=0.8)
{
if(0)
{
#extremely basic model of within host evolutionary rate from HIV-1B pol estimates in the literature
#median log10 ER of pol in Alizon / Fraser
df.er <- data.table(ER= 10^c(-1.85, -2.2, -2.5, -2.7, -2.72, -3.2), GENE='POL')
tmp <- gamlss(ER~1, data=df.er, family=LOGNO)
x <- seq(0.0005, 0.02, 0.0001)
tmp <- data.table(x=x, y5=dLOGNO(x, mu=log(0.005), sigma=0.8), y4=dLOGNO(x, mu=log(0.004), sigma=0.7), y3=dLOGNO(x, mu=log(0.003), sigma=0.6))
tmp <- data.table(x=x, y83=dLOGNO(x, mu=-4.784295+0.045, sigma=0.3), y62=dLOGNO(x, mu=-5.071977+0.08, sigma=0.4), y41=dLOGNO(x, mu=-5.477443+0.18, sigma=0.6))
# should have been '-' all along:
# mean(rLOGNO(1e4, mu=-5.071977+0.4^2/2, sigma=0.4))
tmp <- data.table(x=x, y62=dLOGNO(x, mu=-5.071977+0.08, sigma=0.4), y62b=dLOGNO(x, mu=-5.071977-0.4^2/2, sigma=0.4) )
tmp <- data.table(x=x, y62=dLOGNO(x, mu=log(0.006716145)-0.37^2/2, sigma=0.37), y621=dLOGNO(x, mu=-5.071977-0.2^2/2, sigma=0.2), y441=dLOGNO(x, mu=log(0.00447743)-0.3^2/2, sigma=0.3) )
tmp <- melt(tmp, id.var='x')
ggplot(tmp, aes(x=x, y=value, group=variable, colour=variable)) + geom_line() + scale_x_continuous(breaks=seq(0,0.02,0.002))
#correlation model: need high multiplier if ER high
require(compositions)
y <- rlnorm.rplus(1e4, meanlog=c(-5.071977+0.08, log(3)+0.4^2/2), varlog=diag(c(0.6,0.4))%*%matrix(c(1,0.95,0.95,1),2,2)%*%diag(c(0.6,0.4)) )
tmp <- data.table(wh=y[,1], bm=y[,2])
ggplot(tmp, aes(x= wh, y=bm)) + geom_point()
#explore marginal ER along branches
tmp <- do.call('rbind',lapply(c(3,4,5,6), function(bm)
{
tmp <- rlnorm.rplus(1e4, meanlog=c(-5.071977+0.08, log(bm)+0.4^2/2), varlog=diag(c(0.6,0.4))%*%matrix(c(1,0.95,0.95,1),2,2)%*%diag(c(0.6,0.4)) )
data.table(value=tmp[,1]/tmp[,2], variable=paste('y',bm,sep=''))
}))
ggplot(tmp, aes(x=value, fill=variable)) + geom_histogram(binwidth=0.00025) + facet_grid(.~variable, scales='free')
}
if(wher.sigma>0)
{
rER.pol<- function(n)
{
ans <- numeric(0)
while(length(ans)<n)
{
tmp <- rLOGNO(2*n, mu=wher.mu, sigma=wher.sigma)
tmp[which(tmp>0.02)] <- NA
ans <- c(ans, na.omit(tmp))
}
ans[seq_len(n)]
}
}
if(wher.sigma==0)
{
rER.pol<- function(n)
{
rep(exp(wher.mu), n)
}
}
rER.pol
}
######################################################################################
# return GAG POL ENV ancestral sequences from BEAST PARSER output
# olli originally written 06-08-2014
# tree beast trees in ape format, needed to compute calendar time for each ancestral sequence
# node.stat data.table containing meta information in nexus file for nodes
# bseq data.table containing original sequences. only needed for BEAST decompression.
# return list of GAG POL ENV sequences in ape format
PANGEA.RootSeqSim.get.ancestral.seq.withDecompression<- function(tree, node.stat, bseq, tree.id.sep='_', tree.id.idx.mcmcit=2, tree.id.burnin=1, label.sep='|', label.idx.ctime=2)
{
tree.id <- names(tree)
# add calendar time for inner node at NODE_ID to node.stat
node.stat[, CALENDAR_TIME:=NA_real_]
setkey(node.stat, TREE_ID, NODE_ID)
for(i in seq_along(tree.id))
{
label.ctime <- sapply( strsplit(tree[[i]]$tip.label, label.sep, fixed=TRUE), '[[', label.idx.ctime )
label.ctime <- as.numeric(label.ctime)
depth <- node.depth.edgelength( tree[[ i ]] )
tmp <- which.max(depth)
depth <- depth-depth[tmp]+label.ctime[tmp]
for(j in seq_along(depth))
set(node.stat, node.stat[, which(TREE_ID==tree.id[i] & NODE_ID==j)], 'CALENDAR_TIME', depth[j])
}
tmp <- node.stat[, length(which(is.na(CALENDAR_TIME)))]
cat(paste('\nTotal node statistics with no CALENDAR_TIME [should be zero], n=', tmp ))
# keep only inner nodes
tmp <- lapply(seq_along(tree.id), function(i)
{
subset(node.stat, TREE_ID==tree.id[i] & NODE_ID>Ntip(tree[[i]]))
})
node.stat <- do.call('rbind',tmp)
set(node.stat, NULL, 'VALUE', node.stat[, gsub('\"','',VALUE)])
#
# reconstruct ancestral sequences, need to decompress patterns that were compressed with BEAST
# TODO ** this results in duplicate columns and should be removed at a later point **
#
bseq <- merge(bseq, bseq[, list(SEQ_N=nchar(SEQ)), by=c('GENE','TAXON_ID')], by=c('GENE','TAXON_ID'))
bseq <- bseq[, {
tmp<- unlist(strsplit(SEQ,''))
list( CP1= paste(tmp[seq.int(1,length(tmp),by=3)], collapse='',sep=''),
CP2= paste(tmp[seq.int(2,length(tmp),by=3)], collapse='',sep=''),
CP3= paste(tmp[seq.int(3,length(tmp),by=3)], collapse='',sep='') )
}, by=c('TAXON_ID','GENE')]
bseq <- melt(bseq, measure.var=c('CP1','CP2','CP3'), variable.name='CODON_POS', value.name='SEQ')
# get index of orginal patterns and duplicate patterns
bseq.decompress <- bseq[, {
#print(GENE)
#print(CODON_POS)
tmp <- do.call('rbind',strsplit(SEQ,''))
tmp2 <- apply( tmp, 2, function(x) paste(x,sep='',collapse='')) #identical patterns?
tmp2 <- which(duplicated(tmp2))
#for each duplicate, work out index of original
tmp3 <- sapply(tmp2, function(j1) which(apply( tmp[,seq.int(1,j1-1,1), drop=FALSE] == tmp[,j1], 2, all))[1] )
list(NSEQ=ncol(tmp), DUPLICATE_PATTERN=tmp2, MOTHER_PATTERN=tmp3)
}, by=c('GENE','CODON_POS')]
set(bseq.decompress, bseq.decompress[, which(GENE=='env')], 'GENE', 'ENV')
set(bseq.decompress, bseq.decompress[, which(GENE=='gag')], 'GENE', 'GAG')
set(bseq.decompress, bseq.decompress[, which(GENE=='pol')], 'GENE', 'POL')
set(bseq.decompress, NULL, 'xSTAT', bseq.decompress[, paste(GENE,CODON_POS,sep='.')])
# reconstruct ancestral genes sequences - decompress patterns
ancseq <- node.stat[, {
#print(STAT)
#TREE_ID<- 'STATE_0'
#STAT<- 'GAG.CP3'
tmp <- subset(bseq.decompress, xSTAT==STAT)
seq <- matrix(data=NA_character_, nrow=length(VALUE), ncol=tmp[,NSEQ])
seq.compressed <- setdiff( seq_len(ncol(seq)), tmp[, DUPLICATE_PATTERN])
#print(dim(seq))
tmp2 <- do.call('rbind',strsplit(VALUE,''))
#print(dim(tmp2))
stopifnot(length(seq.compressed)==ncol(tmp2))
seq[, seq.compressed] <- tmp2
#print(seq[1,])
seq[, tmp[, DUPLICATE_PATTERN]] <- seq[, tmp[, MOTHER_PATTERN]]
#print(seq[1,])
#stop()
seq <- apply(seq, 1, function(x) paste(x, sep='',collapse=''))
list(TREE_ID=TREE_ID, NODE_ID=NODE_ID, CALENDAR_TIME=CALENDAR_TIME, SEQ=seq)
}, by=c('STAT')]
# checks of ancseq before we proceed
tmp <- ancseq[, list(NSEQ= nchar(SEQ)), by=c('TREE_ID', 'NODE_ID', 'STAT')]
stopifnot( tmp[, list(CHECK= all(NSEQ==NSEQ[1])), by='STAT'][, all(CHECK)] )
set(tmp, NULL, 'GENE', tmp[, sapply(strsplit(STAT,'\\.'),'[[',1)])
set(tmp, NULL, 'CODON_POS', tmp[, sapply(strsplit(STAT,'\\.'),'[[',2)])
stopifnot( tmp[, list(CHECK=all(NSEQ==NSEQ[1])), by='GENE'][, all(CHECK)] )
# reconstruct genes from codon positions
ancseq <- dcast.data.table(ancseq, TREE_ID + NODE_ID + CALENDAR_TIME ~ STAT, value.var="SEQ")
ancseq <- ancseq[, {
tmp <- do.call('rbind',sapply(list(ENV.CP1,ENV.CP2,ENV.CP3), strsplit, ''))
env <- paste(as.vector(tmp), collapse='')
tmp <- do.call('rbind',sapply(list(GAG.CP1,GAG.CP2,GAG.CP3), strsplit, ''))
gag <- paste(as.vector(tmp), collapse='')
tmp <- do.call('rbind',sapply(list(POL.CP1,POL.CP2,POL.CP3), strsplit, ''))
pol <- paste(as.vector(tmp), collapse='')
list(GAG=gag, POL=pol, ENV=env, CALENDAR_TIME=CALENDAR_TIME)
}, by=c('TREE_ID','NODE_ID')]
set(ancseq, NULL, 'LABEL', ancseq[, paste(TREE_ID, NODE_ID, round(CALENDAR_TIME,d=3), sep='|')])
# remove tree id STATE_xx where xx is smaller than burn-in
set(ancseq, NULL, 'BEAST_MCMC_IT', ancseq[, as.integer(sapply(strsplit(TREE_ID,tree.id.sep),'[[',tree.id.idx.mcmcit))])
ancseq <- subset(ancseq, BEAST_MCMC_IT>tree.id.burnin)
#
# return DNAbin
#
ancseq.gag <- tolower(do.call('rbind',strsplit(ancseq[, GAG],'')))
rownames(ancseq.gag) <- ancseq[, LABEL]
ancseq.gag <- as.DNAbin(ancseq.gag)
ancseq.pol <- tolower(do.call('rbind',strsplit(ancseq[, POL],'')))
rownames(ancseq.pol) <- ancseq[, LABEL]
ancseq.pol <- as.DNAbin(ancseq.pol)
ancseq.env <- tolower(do.call('rbind',strsplit(ancseq[, ENV],'')))
rownames(ancseq.env) <- ancseq[, LABEL]
ancseq.env <- as.DNAbin(ancseq.env)
#ancseq <- cbind(ancseq.gag, ancseq.pol, ancseq.env)
#
list(GAG=ancseq.gag, POL=ancseq.pol, ENV=ancseq.env)
}
######################################################################################
# return GAG POL ENV ancestral sequences from BEAST PARSER output
# olli originally written 09-09-2014
# tree beast trees in ape format, needed to compute calendar time for each ancestral sequence
# node.stat data.table containing meta information in nexus file for nodes
# return list of GAG POL ENV sequences in ape format
PANGEA.RootSeqSim.get.ancestral.seq.pg<- function(tree, node.stat, tree.id.sep='_', tree.id.idx.mcmcit=2, tree.id.burnin=1, label.sep='|', label.idx.ctime=5)
{
tree.id <- names(tree)
# add calendar time for inner node at NODE_ID to node.stat
node.stat[, CALENDAR_TIME:=NA_real_]
setkey(node.stat, TREE_ID, NODE_ID)
for(i in seq_along(tree.id))
{
cat(paste('\nProcess CALENDAR_TIME for tree id', tree.id[i] ))
label.ctime <- sapply( strsplit(tree[[i]]$tip.label, label.sep, fixed=TRUE), '[[', label.idx.ctime )
label.ctime <- as.numeric(label.ctime)
depth <- node.depth.edgelength( tree[[ i ]] )
tmp <- which.max(depth)
depth <- depth-depth[tmp]+label.ctime[tmp]
tmp <- node.stat[, which(TREE_ID==tree.id[i])]
for(j in seq_along(depth))
{
set(node.stat, tmp[ node.stat[tmp, which(NODE_ID==j)] ], 'CALENDAR_TIME', depth[j])
}
}
tmp <- node.stat[, length(which(is.na(CALENDAR_TIME)))]
cat(paste('\nTotal node statistics with no CALENDAR_TIME [should be zero], n=', tmp ))
# keep only inner nodes
tmp <- sapply(tree, Ntip)
stopifnot(all(tmp==tmp[1]))
node.stat <- subset(node.stat, NODE_ID>tmp[1])
#
set(node.stat, NULL, 'VALUE', node.stat[, gsub('\"','',VALUE)])
# checks of ancseq before we proceed
tmp <- node.stat[, list(NSEQ= nchar(VALUE)), by=c('TREE_ID', 'NODE_ID', 'STAT')]
stopifnot( tmp[, list(CHECK= all(NSEQ==NSEQ[1])), by='STAT'][, all(CHECK)] )
set(tmp, NULL, 'GENE', tmp[, sapply(strsplit(STAT,'\\.'),'[[',1)])
set(tmp, NULL, 'CODON_POS', tmp[, sapply(strsplit(STAT,'\\.'),'[[',2)])
tmp <- dcast.data.table(tmp, TREE_ID + NODE_ID + GENE ~ CODON_POS, value.var='NSEQ')
tmp <- tmp[, list(CPM=min(CP1, CP2, CP3)), by=c('TREE_ID','NODE_ID','GENE')]
stopifnot( tmp[, list(CHECK=all(CPM==CPM[1])), by='GENE'][, all(CHECK)] )
setkey(tmp, GENE)
# truncate to following size of coding regions (if necessary)
tmp <- unique(tmp, by='GENE')[, list(STAT=paste(GENE,'.CP',1:3,sep=''), CPM=CPM), by='GENE']
node.stat <- merge(node.stat, subset(tmp, select=c(STAT, CPM)), by='STAT')
set(node.stat, NULL, 'VALUE', node.stat[, substr(VALUE,1,CPM)])
set(node.stat, NULL, 'CPM', NULL)
set(node.stat, NULL, 'GENE', node.stat[, substr(STAT,1,nchar(STAT)-4)])
set(node.stat, NULL, 'STAT', node.stat[, substr(STAT,nchar(STAT)-2,nchar(STAT))])
# reconstruct genes from codon positions
node.stat <- dcast.data.table(node.stat, TREE_ID + NODE_ID + GENE + CALENDAR_TIME ~ STAT, value.var="VALUE")
node.stat <- node.stat[, {
tmp <- do.call('rbind',sapply(list(CP1,CP2,CP3), strsplit, ''))
tmp <- paste(as.vector(tmp), collapse='')
list(SEQ=tmp, GENE=GENE, CALENDAR_TIME=CALENDAR_TIME)
}, by=c('TREE_ID','NODE_ID')]
set(node.stat, NULL, 'LABEL', node.stat[, paste(TREE_ID, NODE_ID, round(CALENDAR_TIME,d=3), sep='|')])
# remove tree id STATE_xx where xx is smaller than burn-in
set(node.stat, NULL, 'BEAST_MCMC_IT', node.stat[, as.integer(sapply(strsplit(TREE_ID,tree.id.sep),'[[',tree.id.idx.mcmcit))])
node.stat <- subset(node.stat, BEAST_MCMC_IT>tree.id.burnin)
cat(paste('\nFound ancestral sequences, n=', nrow(node.stat) ))
tmp <- node.stat[, unique(GENE)]
stopifnot(length(tmp)==1)
node.stat
}
##--------------------------------------------------------------------------------------------------------
## program to generate files for Seq Gen from output of Matt s VirusTreeSimulator
## olli originally written 18-09-2014
## modified 17-01-2015
##--------------------------------------------------------------------------------------------------------
#' @export
PANGEA.SeqGen.createInputFile<- function(indir.epi, infile.epi, indir.vts, infile.prefix, outdir.sg, pipeline.args, verbose=1, with.plot=1, label.sep='|')
{
EPS <- 1e-12
stopifnot( all( c('s.seed','wher.mu','wher.sigma','bwerm.mu','bwerm.sigma')%in%pipeline.args[, stat] ) )
#
set.seed( pipeline.args['s.seed',][, as.numeric(v)] )
#
# setup samplers
#
file <- paste(indir.epi, '/', infile.epi, sep='')
load(file) #expect "df.epi" "df.trms" "df.inds" "df.sample"
#
cat(paste('\ncreate sampler of evolutionary rates'))
# create sampler of within host evolutionary rates
rER.pol <- PANGEA.WithinHostEvolutionaryRate.create.sampler.v1(wher.mu=pipeline.args['wher.mu',][, as.numeric(v)], wher.sigma=pipeline.args['wher.sigma',][, as.numeric(v)])
# create sampler of between host evolutionary rate dampener
tmp <- PANGEA.TransmissionEdgeEvolutionaryRate.create.sampler(er.meanlog=pipeline.args['bwerm.mu',][, as.numeric(v)], er.sdlog=pipeline.args['bwerm.sigma',][, as.numeric(v)])
rERbw <- tmp$rERbw
rERbw.args <- tmp$rERbw.args
# create sampler of ancestral sequences
cat(paste('\ncreate sampler of ancestral sequences'))
tmp <- subset( df.inds, !is.na(TIME_SEQ) )[, length(unique(IDCLU))]
cat(paste('\nnumber of clusters for which a sequence is required, n=', tmp))
tmp <- ifelse(tmp<400, 0.5, ifelse(tmp<500, 1, 2))
cat(paste('\nUsing root.ctime.grace=', tmp))
tmp <- PANGEA.RootSeq.create.sampler(root.ctime.grace= tmp, sample.grace= 3 )
rANCSEQ <- tmp$rANCSEQ
rANCSEQ.args <- tmp$rANCSEQ.args
# read GTR parameters
log.df <- PANGEA.GTR.params()
if(pipeline.args['dbg.rER',][, as.numeric(v)]==1 )
{
cat(paste('\nSetting mus to mean per gene and codon_pos'))
tmp <- log.df[, list(mu= mean(mu)), by=c('GENE','CODON_POS')]
#tmp[, ER:=mu*log.df$meanRate[1]]
log.df <- merge( subset(log.df, select=which(colnames(log.df)!='mu')), tmp, by=c('GENE','CODON_POS'))
}
if( pipeline.args['dbg.GTRparam',][, as.numeric(v)]==1 )
{
cat(paste('\nSetting GTR parameters to MEAN (except mu)'))
tmp <- mean
log.df <- log.df[, list(state=state, mu=mu, alpha=tmp(alpha), at=tmp(at), ac=tmp(ac), cg=tmp(cg), ag=tmp(ag), gt=tmp(gt), meanRate=tmp(meanRate), a=tmp(a), c=tmp(c), g=tmp(g), t=tmp(t) ), by=c('GENE','CODON_POS')]
}
log.df[, IDX:= seq_len(nrow(log.df))]
log.df[, FILE:=NULL]
#
#
#
infiles <- list.files(indir.vts)
tmp <- paste('^',infile.prefix,'.*nex$',sep='')
infiles <- sort(infiles[ grepl(tmp, infiles) ])
if(!length(infiles)) stop('cannot find files matching criteria')
#
# read from VirusTreeSimulator and convert branch lengths in time to branch lengths in subst/site
#
df.ph <- vector('list', length(infiles))
phd <- vector('list', length(infiles))
phs <- vector('list', length(infiles))
phd.plot <- vector('list', length(infiles))
df.nodestat <- vector('list', length(infiles))
cat(paste('\nUsing StartTimeMode',pipeline.args['index.starttime.mode',][,v]))
root.edge.rate <- NA
if( pipeline.args['index.starttime.mode',][,v]=='shift' )
{
root.edge.rate <- 1e-6
cat(paste('\nFix root edge rate to =',root.edge.rate))
}
if( as.numeric(pipeline.args['root.edge.fixed',][,v] ))
{
root.edge.rate <- log.df[1,meanRate]
cat(paste('\nFix root edge rate to =',root.edge.rate))
}
for(i in seq_along(infiles))
{
#i <- 40
#i<- 47; i<- 9
infile <- infiles[i]
cat(paste('\nprocess file',i,infile))
file <- paste(indir.vts, '/', infile, sep='')
# read brl, units from annotated nexus file. attention: () may not contain two nodes
tmp <- hivc.beast2out.read.nexus.and.stats(file, method.node.stat='any.node')
ph <- tmp$tree
node.stat <- tmp$node.stat
node.stat <- subset(node.stat, STAT=='Unit')
set(node.stat, NULL, 'VALUE', node.stat[, gsub('\"','',VALUE)])
node.stat[, IDPOP:= as.integer(node.stat[,substr(VALUE, 4, nchar(VALUE))])]
node.stat <- merge(subset(df.inds, select=c(IDPOP, GENDER, DOB, TIME_SEQ, IDCLU)), subset(node.stat, select=c(IDPOP, NODE_ID)), by='IDPOP')
# produce collapsed tree with branch length in units of calendar time
phd[[i]] <- seq.collapse.singles(ph)
#
# create collapsed Newick tree with expected substitutions / site for each branch
#
# draw within host evolutionary rates for every individual in the transmission chain, and smaller ERs along the transmission lineages
node.stat <- merge(node.stat, data.table( IDPOP=node.stat[, unique(IDPOP)], ER= rER.pol(node.stat[, length(unique(IDPOP))]), BWM= rERbw(node.stat[, length(unique(IDPOP))], rERbw.args) ), by='IDPOP')
# re-set to previous notation in terms of BWM ( between host multiplier to ER, ie ER= within-host ER / BWM )
set(node.stat, NULL, 'BWM', node.stat[, ER/BWM])
# set BWM to 1 for all edges that are NOT leading to a transmission.
# Because only one seq is sampled per patient, these are only edges that end in a tip.
set(node.stat, node.stat[, which( NODE_ID%in%seq.int(1,Ntip(ph)) )], 'BWM', 1.)
# no ER possible for root node - there s no edge leading to it
set(node.stat, node.stat[, which(NODE_ID==Ntip(ph)+1)], c('ER','BWM'), NA_real_)
# set root edge evolutionary rate to overall mean between-host rate
# get NODE_ID of edge from root
if(!is.na(root.edge.rate))
{
tmp <- ph$edge[match(Ntip(ph)+1, ph$edge[1, ]), 2]
tmp <- node.stat[, which(NODE_ID==tmp)]
set(node.stat, tmp, 'ER', root.edge.rate )
set(node.stat, tmp, 'BWM', 1. ) # no need to further slow down root edge
}
# check root edge length
if( pipeline.args['index.starttime.mode',][,v]=='fix45' )
{
stopifnot( ph$edge.length[ which( ph$edge[, 1] == Ntip(ph)+1 ) ]>=29.5 )
}
# check calendar time of root in simulated phylogeny for consistency
tmp <- seq.collapse.singles(ph)
tmp2 <- regmatches(tmp$tip.label[1], regexpr('ID_[0-9]+',tmp$tip.label[1]))
tmp2 <- as.numeric(substr(tmp2, 4, nchar(tmp2)))
tmp2 <- subset(node.stat, IDPOP==tmp2)[1, TIME_SEQ]
root.ctime <- ifelse(Nnode(tmp), tmp2 - (node.depth.edgelength(tmp)[1] + tmp$root.edge), tmp2-tmp$root.edge)
tmp <- subset(node.stat, IDPOP<0)[, unique(IDPOP)]
stopifnot(length(tmp)==1)
stopifnot(subset(df.trms, IDTR==tmp)[, round(IDTR_TIME_INFECTED, d=1)]==round(root.ctime, d=1))
# check if all sampling times are consistent with node height
tmp <- subset( node.stat, NODE_ID<=Ntip(ph) )
setkey(tmp, NODE_ID)
tmp2 <- seq.collapse.singles(ph)
if( Nnode(tmp2) )
tmp[, NODE_DEPTH:=root.ctime + tmp2$root.edge + node.depth.edgelength(tmp2)[ seq_len(Ntip(tmp2)) ] ]
if( Nnode(tmp2)==0 )
tmp[, NODE_DEPTH:=root.ctime + tmp2$root.edge ]
stopifnot( tmp[, max(abs(NODE_DEPTH-TIME_SEQ))<=1e-6 ] )
# set expected numbers of substitutions per branch within individual IDPOP
setkey(node.stat, NODE_ID)
ph$edge.length <- ph$edge.length * node.stat[ ph$edge[, 2], ][, ER / BWM]
stopifnot(all(!is.na(ph$edge.length)))
# once expected number of substitutions / site are simulated, can collapse singleton nodes
ph <- seq.collapse.singles(ph)
# set tip label so that IDPOP can be checked for consistency
if(pipeline.args['epi.model',][,v]=='HPTN071')
node.stat[, LABEL:= node.stat[, paste('IDPOP_',IDPOP,label.sep,GENDER,label.sep,'DOB_',round(DOB,d=3),label.sep,round(TIME_SEQ,d=3),sep='')]]
if(pipeline.args['epi.model',][,v]=='DSPS')
node.stat[, LABEL:= node.stat[, paste('IDPOP_',IDPOP,label.sep,GENDER,label.sep,'DOB_',NA,label.sep,round(TIME_SEQ,d=3),sep='')]]
setkey(node.stat, NODE_ID)
ph$tip.label <- node.stat[seq_len(Ntip(ph)), ][, LABEL]
phd[[i]]$tip.label <- node.stat[seq_len(Ntip(phd[[i]])), ][, LABEL]
phd.plot[[i]] <- seq.singleton2bifurcatingtree( phd[[i]] )
phs[[i]] <- seq.singleton2bifurcatingtree( ph )
#phd[[i]] <- seq.addrootnode( phd[[i]], dummy.label=paste('NOEXIST_IDCLU',node.stat[, unique(IDCLU)],'|NA|DOB_NA|',root.ctime,sep='') )
#
df.nodestat[[i]] <- node.stat
if(Nnode(ph))
{
tmp <- write.tree(ph, digits = 10)
tmp <- paste( '(',substr(tmp,1,nchar(tmp)-1),',NOEXIST_NA|NA|DOB_NA|',root.ctime,':0):0;', sep='')
phd[[i]] <- write.tree(phd[[i]], digits = 10)
}
if(!Nnode(ph))
{
tmp <- paste( '(',ph$tip.label,':',ph$root.edge,',NOEXIST_NA|NA|DOB_NA|',root.ctime,':0):0;', sep='')
phd[[i]] <- paste( '(',phd[[i]]$tip.label,':',phd[[i]]$root.edge, ');', sep='' )
}
df.ph[[i]] <- data.table(ROOT_CALENDAR_TIME= root.ctime, IDCLU=node.stat[, unique(IDCLU)], NEWICK=tmp)
#readline()
}
# write cluster trees of each transmission network into a single newick file
phd <- sapply(phd,'[[',1)
file <- paste(indir.epi, '/', substr(infile.epi,1,nchar(infile.epi)-6),'DATEDCLUTREES.newick', sep='')
cat(paste('\nWrite to file=',file))
writeLines(phd, con=file)
# get multifurcating tree with brl in units of calendar time
options(expressions=1e4)
phd <- eval(parse(text=paste('phd.plot[[',seq_along(phd.plot),']]', sep='',collapse='+')))
options(expressions=5e3)
phd <- drop.tip(phd, which(grepl('DUMMY', phd$tip.label)), root.edge=1)
phd <- ladderize(phd)
# write multifurcating tree with brl in units of calendar time
file <- paste(indir.epi, '/', substr(infile.epi,1,nchar(infile.epi)-6),'DATEDTREE.newick', sep='')
cat(paste('\nWrite to file=',file))
write.tree(phd, file=file)
# write multifurcating tree with brl in units of subst/site
options(expressions=1e4)
phs <- eval(parse(text=paste('phs[[',seq_along(phs),']]', sep='',collapse='+')))
options(expressions=5e3)
phs <- drop.tip(phs, which(grepl('DUMMY', phs$tip.label)), root.edge=1)
phs <- ladderize(phs)
file <- paste(indir.epi, '/', substr(infile.epi,1,nchar(infile.epi)-6),'SUBSTTREE.newick', sep='')
cat(paste('\nWrite to file=',file))
write.tree(phs, file=file)
#
df.ph <- do.call('rbind',df.ph)
df.nodestat <- do.call('rbind',df.nodestat)
#
#
# check that we have exactly one root edge with overall mean between host rate per cluster
if(!is.na(root.edge.rate))
stopifnot( df.nodestat[, length(unique(IDCLU))]==nrow(subset(df.nodestat, ER==root.edge.rate)) )
#tmp <- unique(subset(df.inds, IDPOP>=-110 & IDPOP<0, IDCLU))
#tmp <- merge(df.inds, tmp, by='IDCLU')[, length(which(!is.na(TIME_SEQ)))] / df.inds[, length(which(!is.na(TIME_SEQ)))]
#cat(paste('\nProportion of sequences descending from no import after baseline=', tmp))
#df.nodestat[, length(which(!is.na(TIME_SEQ))), by='IDCLU']
#subset(df.nodestat, IDCLU==72)
#subset(df.ph, IDCLU==72)
#subset(df.ph, IDCLU==47)
if(with.plot)
{
#subset(df.nodestat, ER!=root.edge.rate)[, table(BWM==1.)]
#ggplot(subset(df.nodestat, ER!=root.edge.rate) , aes(x=ER/BWM)) + geom_histogram(binwidth=0.001)
#ggplot(subset(df.nodestat, ER!=root.edge.rate & BWM!=1.) , aes(x=ER/BWM)) + geom_histogram(binwidth=0.0001)
#ggplot(subset(df.nodestat, ER!=root.edge.rate), aes(x=ER, y=BWM)) + geom_point()
# plot used within-host ERs
tmp <- root.edge.rate
if(is.na(tmp))
tmp <- Inf
ggplot(subset(df.nodestat, ER!=tmp), aes(x=ER/BWM)) + geom_histogram(binwidth=0.001) + labs(x='simulated within-host evolutionary rate') +
scale_x_continuous(breaks= seq(0, 0.02, 0.002))
file <- paste(indir.epi, '/', substr(infile.epi,1,nchar(infile.epi)-6),'INFO_sg_ER.pdf', sep='')
cat(paste('\nWrite to file=',file))
ggsave(file, w=6, h=6)
# plot used between host modifiers
ggplot(subset(df.nodestat, ER!=tmp & BWM==1) , aes(x=ER/BWM)) + geom_histogram(binwidth=0.001) + labs(x='simulated within-host rate evolutionary rate\nwithout transmission edges') +
scale_x_continuous(breaks= seq(0, 0.02, 0.002))
file <- paste(indir.epi, '/', substr(infile.epi,1,nchar(infile.epi)-6),'INFO_sg_BWM.pdf', sep='')
cat(paste('\nWrite to file=',file))
ggsave(file, w=6, h=6)
# plot used ERs along transmission edges
ggplot(subset(df.nodestat, ER!=tmp & BWM!=1) , aes(x=ER/BWM)) + geom_histogram(binwidth=0.0001) + labs(x='simulated evolutionary rates along transmission edges') +
scale_x_continuous(breaks= seq(0, 0.02, 0.0005))
file <- paste(indir.epi, '/', substr(infile.epi,1,nchar(infile.epi)-6),'INFO_sg_BWER.pdf', sep='')
cat(paste('\nWrite to file=',file))
ggsave(file, w=6, h=6)
if(grepl('fix',pipeline.args['index.starttime.mode',][,v]))
{
setkey(df.nodestat, IDPOP)
tmp <- merge(subset(unique(df.nodestat, by='IDPOP'), select=c(IDCLU, LABEL)), unique(df.nodestat, by='IDPOP')[, list(IDCLU_N=length(which(!is.na(TIME_SEQ)))), by='IDCLU'], by='IDCLU')
tmp[, LABEL_NEW:= tmp[, paste(IDCLU,'_',IDCLU_N,label.sep,LABEL,sep='')]]
setkey(tmp, LABEL)
phd$tip.label <- tmp[ phd$tip.label, ][, LABEL_NEW]
file <- paste(indir.epi, '/', substr(infile.epi,1,nchar(infile.epi)-6),'DATEDTREE.pdf', sep='')
cat(paste('\nWrite to file=',file))
pdf(file=file, w=10, h=Ntip(phd)*0.1)
plot(phd, show.tip=TRUE, cex=0.5)
dev.off()
}
}
#
# draw ancestral sequences and add to df.ph
#
if(pipeline.args['startseq.mode',][,v]=='many')
{
root.ctime <- df.ph[, ROOT_CALENDAR_TIME]
ancseq <- rANCSEQ(root.ctime, rANCSEQ.args)
ancseq <- data.table(ANCSEQ= apply(as.character(ancseq),1,function(x) paste(x, collapse='')) )
}
if(pipeline.args['startseq.mode',][,v]=='one')
{
cat(paste('\nStartSeqModel=',pipeline.args['startseq.mode',][,v],'use first sampled starting sequence for all' ))
stopifnot(max(abs(df.ph[1, ROOT_CALENDAR_TIME]-df.ph[, ROOT_CALENDAR_TIME]))<100*EPS)
root.ctime <- df.ph[1, ROOT_CALENDAR_TIME]
tmp <- rANCSEQ(root.ctime, rANCSEQ.args)
tmp <- data.table(ANCSEQ= apply(as.character(tmp),1,function(x) paste(x, collapse='')) )
ancseq <- data.table(ANCSEQ=rep(NA_character_, nrow(df.ph)))
set(ancseq, NULL, 'ANCSEQ', tmp[1,ANCSEQ])
}
df.ph <- cbind(df.ph, ancseq)
#
# create SEQ-GEN input file
#
partition.len <- c( ncol(rANCSEQ.args$anc.seq.gag), ncol(rANCSEQ.args$anc.seq.pol), ncol(rANCSEQ.args$anc.seq.env) )
#partition.er <- c( 2.5, 4, 5 )
# split ancestral sequence into GENE and CODON_POS
df.ph[, ANCSEQ.GAG:= substr(ANCSEQ, 1, partition.len[1])]
df.ph[, ANCSEQ.POL:= substr(ANCSEQ, partition.len[1]+1, partition.len[1]+partition.len[2])]
df.ph[, ANCSEQ.ENV:= substr(ANCSEQ, partition.len[1]+partition.len[2]+1, partition.len[1]+partition.len[2]+partition.len[3])]
stopifnot( all( strsplit( df.ph[1, ANCSEQ], '')[[1]] == strsplit( paste( df.ph[1, ANCSEQ.GAG], df.ph[1, ANCSEQ.POL], df.ph[1, ANCSEQ.ENV], sep=''), '' )[[1]] ) )
df.ph[, ANCSEQ:=NULL]
df.ph <- melt(df.ph, id.var=c('ROOT_CALENDAR_TIME','IDCLU','NEWICK'), value.name='ANCSEQ', variable.name='GENE', variable.factor=FALSE)
set(df.ph, NULL, 'GENE', df.ph[, substr(GENE, 8, nchar(GENE))])
df.ph <- df.ph[, {
tmp <- strsplit(ANCSEQ, '')
list( ANCSEQ.CP1= sapply(tmp, function(x) paste(x[seq.int(1,nchar(ANCSEQ[1]),3)],collapse='') ),
ANCSEQ.CP2= sapply(tmp, function(x) paste(x[seq.int(2,nchar(ANCSEQ[1]),3)],collapse='') ),
ANCSEQ.CP3= sapply(tmp, function(x) paste(x[seq.int(3,nchar(ANCSEQ[1]),3)],collapse='') ), ROOT_CALENDAR_TIME=ROOT_CALENDAR_TIME, IDCLU=IDCLU, NEWICK=NEWICK )
}, by='GENE']
df.ph <- melt(df.ph, id.var=c('ROOT_CALENDAR_TIME','IDCLU','NEWICK','GENE'), value.name='ANCSEQ', variable.name='CODON_POS', variable.factor=FALSE)
set(df.ph, NULL, 'CODON_POS', df.ph[, substr(CODON_POS, 8, nchar(CODON_POS))])
#
# save to file all we need to call SeqGen
#
file <- paste(outdir.sg,'/',infile.prefix, 'seqgen.R',sep='')
cat(paste('\nsave to file=',file))
df.seqgen <- df.ph
save(df.seqgen, log.df, df.nodestat, file=file)
}
######################################################################################
# Program to simulate sequences with Seq-Gen-1.3.3
# olli originally written 27-01-2015
######################################################################################
#' @export
PANGEA.SeqGen.run.v4<- function(indir.epi, infile.epi, indir.sg, infile.prefix, outdir, pipeline.args)
{
set.seed(pipeline.args['s.seed',][, as.numeric(v)])
#
file <- paste(indir.epi, '/', infile.epi, sep='')
load(file) #expect "df.epi" "df.trms" "df.inds" "df.sample" "df.sp"
#
file <- paste(indir.sg,'/',infile.prefix, 'seqgen.R',sep='')
cat(paste('\nLoad seqgen R input file, file=',file))
load(file) #expect df.seqgen, gtr.central, log.df, df.nodestat
#
if( pipeline.args['er.gtr',][,v]=='GTR_POL_CP1' )
{
cat('\nUsing rate model GTR_POL_CP1: use pre-estimated evolutionary rate and GTR rates from pol, codon position 1')
# reset rate models for gag, env and codons cp2 cp3
tmp <- subset(log.df, GENE=='POL' & CODON_POS=='CP1')
set(tmp, NULL, 'mu', 1)
set(tmp, NULL, c('GENE','CODON_POS','IDX'), NULL)
tmp[, DUMMY:=1L]
log.df <- unique(subset(log.df, select=c(GENE, CODON_POS)), by=c('GENE','CODON_POS'))
log.df[, DUMMY:=1L]
log.df <- merge(log.df, tmp, by='DUMMY',allow.cartesian=TRUE)
log.df[, DUMMY:=NULL]
log.df[, IDX:=seq_len(nrow(log.df))]
}
if( pipeline.args['index.starttime.mode',][,v]=='shift' )
root.edge.rate <- 1e-6
if( pipeline.args['index.starttime.mode',][,v]!='shift' )
root.edge.rate <- log.df[1,meanRate]
stopifnot( all( c('s.seed')%in%pipeline.args[, stat] ) )
#
# create SeqGen input files
#
#z<- read.tree(text= subset(df.seqgen, IDCLU==1)[, NEWICK] )
#z<- read.tree(text= subset(df.seqgen, IDCLU==1 & GENE=='GAG' & CODON_POS=='CP1')[, NEWICK] )
df.ph.out <- df.seqgen[, {
#print( table( strsplit(ANCSEQ, ''), useNA='if') )
file <- paste(indir.sg,'/',infile.prefix, IDCLU, '_', GENE, '_', CODON_POS,'.seqgen',sep='')
cat(paste('\nwrite to file',file))
txt <- paste('1\t',nchar(ANCSEQ),'\n',sep='')
txt <- paste(txt, 'ANCSEQ\t',toupper(ANCSEQ),'\n',sep='')
txt <- paste(txt, '1\n',sep='')
txt <- paste(txt, NEWICK, '\n',sep='')
cat(txt, file=file)
list(FILE= paste(infile.prefix, IDCLU, '_', GENE, '_', CODON_POS,'.seqgen',sep='') )
# ./seq-gen -mHKY -t3.0 -f0.3,0.2,0.2,0.3 -n1 -k1 -on < /Users/Oliver/git/HPTN071sim/tmp/SeqGen/140716_RUN001_50.seqgen > example.nex
}, by=c('GENE','CODON_POS','IDCLU')]
df.ph.out <- df.ph.out[, {
tmp <- log.df[['IDX']][ which( log.df[['GENE']]==GENE & log.df[['CODON_POS']]==CODON_POS ) ]
stopifnot(length(tmp)>1)
list( FILE=FILE, IDCLU=IDCLU, IDX=sample(tmp, length(FILE), replace=FALSE) )
}, by=c('GENE','CODON_POS')]
# sample GTR parameters from posterior
df.ph.out <- merge(df.ph.out, log.df, by=c('GENE', 'CODON_POS', 'IDX'))
stopifnot( nrow(df.ph.out)<65535 ) # running out of seeds?
df.ph.out[, SEED:=sample.int(65535, nrow(df.ph.out))]
#
if(with.plot)
{
tmp <- subset(df.nodestat, ER!=root.edge.rate, select=c(IDPOP, ER, BWM, IDCLU))
tmp <- merge(tmp, subset(df.ph.out, select=c(GENE, CODON_POS, IDCLU, mu)), by='IDCLU', allow.cartesian=TRUE)
set(tmp, NULL, 'ER', tmp[, ER*mu])
ggplot(tmp, aes(x=CODON_POS, y=ER, colour=CODON_POS, group=CODON_POS)) + geom_boxplot() +
facet_grid(.~GENE, scales='free_y') +
scale_colour_discrete(guide=FALSE) +
scale_y_continuous(breaks= seq(0, 0.05, 0.002)) + labs(linetype='Gene', y='simulated within-host evolutionary rate', x='codon position')
file <- paste(indir.sg,'/',infile.prefix, 'ER_by_gene.pdf',sep='')
ggsave(file=file, w=6, h=6)
}
#
# call SeqGen command line
#
cat(paste('\nUsing Gamma rate variation, gamma=',pipeline.args['er.gamma',][, as.numeric(v)]))
tmp <- df.ph.out[, {
cat(paste('\nProcess', IDCLU, GENE, CODON_POS))
cmd <- cmd.SeqGen(indir.sg, FILE, indir.sg, gsub('seqgen','phy',FILE), prog=PR.SEQGEN, prog.args=paste('-n',1,' -k1 -or -z',SEED,sep=''),
alpha=alpha, gamma=pipeline.args['er.gamma',][, as.numeric(v)], invariable=0, scale=mu, freq.A=a, freq.C=c, freq.G=g, freq.T=t,
rate.AC=ac, rate.AG=ag, rate.AT=at, rate.CG=cg, rate.CT=1, rate.GT=gt)
system(cmd)
list(CMD=cmd)
}, by='FILE']
#
# check for non-zero length and if necessary repeat seq-gen
#
infiles <- list.files(indir.sg)
infiles <- infiles[ grepl('*phy$', infiles) ]
if(!length(infiles)) stop('cannot find files matching criteria')
infile.df <- data.table(FILE=infiles)
infile.df <- infile.df[, list(SIZE= file.size(file.path(indir.sg, FILE))), by='FILE']
tmp <- infile.df[, strsplit(FILE, '_') ]
infile.df[, CODON_POS:= gsub('.phy','',sapply(tmp, function(x) rev(x)[label.idx.codonpos]))]
infile.df[, GENE:= sapply(tmp, function(x) rev(x)[label.idx.gene])]
infile.df[, IDCLU:= as.integer(sapply(tmp, function(x) rev(x)[label.idx.clu]))]
tmp <- subset(infile.df, SIZE==0, c(IDCLU, GENE, CODON_POS))
tmp <- merge(tmp, df.ph.out, by=c('IDCLU','GENE','CODON_POS'))
if(nrow(tmp))
{
cat('Found simulated phy files with zero bytes -- try and repeat')
invisible(tmp[, {
cat(paste('\nProcess', IDCLU, GENE, CODON_POS))
cmd <- cmd.SeqGen(indir.sg, FILE, indir.sg, gsub('seqgen','phy',FILE), prog=PR.SEQGEN, prog.args=paste('-n',1,' -k1 -or -z',SEED,sep=''),
alpha=alpha, gamma=pipeline.args['er.gamma',][, as.numeric(v)], invariable=0, scale=mu, freq.A=a, freq.C=c, freq.G=g, freq.T=t,
rate.AC=ac, rate.AG=ag, rate.AT=at, rate.CG=cg, rate.CT=1, rate.GT=gt)
system(cmd)
list(CMD=cmd)
}, by='FILE'])
}
#
# process SeqGen runs
#
# collect simulated sequences
infiles <- list.files(indir.sg)
infiles <- infiles[ grepl('*phy$', infiles) ]
if(!length(infiles)) stop('cannot find files matching criteria')
infile.df <- data.table(FILE=infiles)
infile.df <- infile.df[, list(SIZE= file.size(file.path(indir.sg, FILE))), by='FILE']
stopifnot( !nrow(subset(infile.df, SIZE==0)) ) #abort if there are still zero byte files
tmp <- infile.df[, strsplit(FILE, '_') ]
infile.df[, CODON_POS:= sapply(tmp, function(x) rev(x)[label.idx.codonpos])]
infile.df[, GENE:= sapply(tmp, function(x) rev(x)[label.idx.gene])]
infile.df[, IDCLU:= sapply(tmp, function(x) rev(x)[label.idx.clu])]
set(infile.df, NULL, 'CODON_POS', infile.df[, substr(CODON_POS,1,3)])
cat(paste('\nFound sequences for clusters, nclu=', infile.df[, length(unique(IDCLU))]))
#
# read simulated sequences
#
df.seq <- infile.df[, {
cat(paste('\nread seq in file',FILE))
file <- paste(indir.sg,'/',FILE,sep='')
tmp <- as.character(read.dna(file, format='sequential'))
list( SEQ=apply(tmp,1,function(x) paste(x, collapse='')), LABEL=rownames(tmp) )
}, by='FILE']
df.seq <- merge(df.seq, infile.df, by='FILE')
#
# reconstruct genes from codon positions
#
df.seq[, STAT:=paste(GENE,CODON_POS,sep='.')]
df.seq <- dcast.data.table(df.seq, IDCLU + LABEL ~ STAT, value.var="SEQ")
# check that seq of correct size
stopifnot( df.seq[, all( nchar(GAG.CP1)==nchar(GAG.CP2) & nchar(GAG.CP1)==nchar(GAG.CP3) )] )
stopifnot( df.seq[, all( nchar(POL.CP1)==nchar(POL.CP2) & nchar(POL.CP1)==nchar(POL.CP3) )] )
stopifnot( df.seq[, all( nchar(ENV.CP1)==nchar(ENV.CP2) & nchar(ENV.CP1)==nchar(ENV.CP3) )] )
#
df.seq <- df.seq[, {
tmp <- do.call('rbind',sapply(list(ENV.CP1,ENV.CP2,ENV.CP3), strsplit, ''))
env <- paste(as.vector(tmp), collapse='')
tmp <- do.call('rbind',sapply(list(GAG.CP1,GAG.CP2,GAG.CP3), strsplit, ''))
gag <- paste(as.vector(tmp), collapse='')
tmp <- do.call('rbind',sapply(list(POL.CP1,POL.CP2,POL.CP3), strsplit, ''))
pol <- paste(as.vector(tmp), collapse='')
list(GAG=gag, POL=pol, ENV=env, IDCLU=IDCLU)
}, by=c('LABEL')]
# check that we have indeed seq for all sampled individuals
df.seq <- subset(df.seq, !grepl('NOEXIST',LABEL))
tmp <- df.seq[, sapply( strsplit(LABEL, treelabel.idx.sep, fixed=TRUE), '[[', treelabel.idx.idpop )]
df.seq[, IDPOP:=as.integer(substr(tmp,7,nchar(tmp)))]
stopifnot( setequal( subset( df.inds, !is.na(TIME_SEQ) )[, IDPOP], df.seq[,IDPOP]) )
#
# save simulated data -- internal
#
file <- paste(outdir, '/', infile.prefix, 'SIMULATED_INTERNAL.R', sep='')
cat(paste('\nwrite to file', file))
save(df.epi, df.trms, df.inds, df.sample, df.seq, df.sp, file=file)
#
# save simulated data -- to be shared
#
if(pipeline.args['epi.model'][,v]=='HPTN071')
{
tmp <- subset( df.inds, !is.na(TIME_SEQ), select=c(IDPOP, GENDER, DOB, DOD, DIAG_T, DIAG_CD4, ART1_T, ART1_CD4, TIME_SEQ, RECENT_TR ) )
tmp2 <- tmp[, which(!is.na(RECENT_TR))]
cat(paste('\nSet RECENT_TR to missing for p=',1-pipeline.args['report.prop.recent',][,as.numeric(v)]))
tmp2 <- sample(tmp2, round(length(tmp2)*(1-pipeline.args['report.prop.recent',][,as.numeric(v)])))
set(tmp, NULL, 'RECENT_TR', tmp[, as.character(RECENT_TR)])
set(tmp, tmp2, 'RECENT_TR', NA_character_)
set(tmp, NULL, 'RECENT_TR', tmp[, factor(RECENT_TR)])
set(tmp, NULL, 'GENDER', tmp[,as.character(GENDER)])
tmp2 <- tmp[, which(is.na(DIAG_T) & TIME_SEQ<2000)]
cat(paste('\nSet patient variables to NA for archival seq, n=',length(tmp2)))
set(tmp, tmp2, c('DOB','DOD'), NA_real_)
set(tmp, tmp2, 'GENDER', NA_character_)
tmp2 <- tmp[, which(is.na(DIAG_T) & TIME_SEQ>=2000)]
cat(paste('\nSet patient variables to NA after 200, n=',length(tmp2)))
print(tmp[tmp2,])
set(tmp, tmp2, c('DOB','DOD'), NA_real_)
set(tmp, tmp2, 'GENDER', NA_character_)
set(tmp, NULL, 'GENDER', tmp[,factor(GENDER)])
}
if(pipeline.args['epi.model'][,v]=='DSPS')
tmp <- subset( df.inds, !is.na(TIME_SEQ), select=c(IDPOP, GENDER, TIME_SEQ ) )
file <- paste(outdir, '/', infile.prefix, 'SIMULATED_metadata.csv', sep='')
cat(paste('\nwrite to file', file))
write.csv(tmp, file)
tmp <- tolower(do.call('rbind',strsplit(df.seq[, GAG],'')))
rownames(tmp) <- df.seq[, LABEL]
df.seq.gag <- as.DNAbin(tmp)
file <- paste(outdir, '/', infile.prefix, 'SIMULATED_gag.fa', sep='')
write.dna(df.seq.gag, file, format = "fasta")
tmp <- tolower(do.call('rbind',strsplit(df.seq[, POL],'')))
rownames(tmp) <- df.seq[, LABEL]
df.seq.pol <- as.DNAbin(tmp)
file <- paste(outdir, '/', infile.prefix, 'SIMULATED_pol.fa', sep='')
write.dna(df.seq.pol, file, format = "fasta")
tmp <- tolower(do.call('rbind',strsplit(df.seq[, ENV],'')))
rownames(tmp) <- df.seq[, LABEL]
df.seq.env <- as.DNAbin(tmp)
file <- paste(outdir, '/', infile.prefix, 'SIMULATED_env.fa', sep='')
write.dna(df.seq.env, file, format = "fasta")
df.seq <- merge(df.seq, df.seq[, list(IDCLU_N=length(IDPOP)), by='IDCLU'], by='IDCLU')
if(with.NJ)
{
#
# create and plot NJ tree on conc seq
#
# load outgroup sequences
file <- system.file(package="PANGEA.HIV.sim", "misc",'PANGEA_SSAfg_HXB2outgroup.R')
cat(paste('\nLoading outgroup seq from file', file))
load(file) #expect "outgroup.seq.gag" "outgroup.seq.pol" "outgroup.seq.env"
if(nrow(df.seq)<2000)
df.seq.nj <- copy(df.seq)
if(nrow(df.seq)>=2000)
{
cat(paste('\nToo many seqs for quick NJ tree calculation, selecting first 2000'))
df.seq.nj <- unique(subset(df.seq, select=c(IDCLU, IDCLU_N)), by=c('IDCLU','IDCLU_N'))
df.seq.nj[, IDCLU_CN:=df.seq.nj[, cumsum(IDCLU_N)]]
df.seq.nj <- df.seq.nj[seq_len(max(1, which(IDCLU_CN>2000)[1]-1)), ]
df.seq.nj <- merge(subset(df.seq.nj, select=IDCLU), df.seq, by='IDCLU')
}
# concatenate sequences
df.seq.nj[, TIPCOLOR:='black']
set(df.seq.nj, df.seq.nj[,which(IDCLU_N<3)],'TIPCOLOR','red')
tmp <- tolower(do.call('rbind',strsplit(df.seq.nj[, paste(GAG,POL,ENV,sep='')],'')))
rownames(tmp) <- df.seq.nj[, paste(IDCLU,'_',IDCLU_N,treelabel.idx.sep,LABEL,sep='')]
seq <- as.DNAbin(tmp)
tmp <- cbind(outgroup.seq.gag[,1:ncol(df.seq.gag)], outgroup.seq.pol, outgroup.seq.env)
seq <- rbind(seq,tmp)
seq.ph <- nj(dist.dna(seq, model='raw'))
tmp <- which(seq.ph$tip.label=="HXB2")
seq.ph <- reroot(seq.ph, tmp, seq.ph$edge.length[which(seq.ph$edge[,2]==tmp)])
seq.ph <- ladderize(seq.ph)
file <- paste(indir.sg, '/', infile.prefix, 'INFO_NJconc.pdf', sep='')
cat(paste('\nwrite to file',file))
pdf(file=file, w=10, h=0.1*Ntip(seq.ph))
plot(seq.ph, show.tip=TRUE, cex=0.5, tip.color=df.seq.nj[,TIPCOLOR])
dev.off()
#
# create and plot NJ tree on pol seq
#
tmp <- tolower(do.call('rbind',strsplit(df.seq.nj[, POL],'')))
rownames(tmp) <- df.seq.nj[, paste(IDCLU,'_',IDCLU_N,treelabel.idx.sep,LABEL,sep='')]
seq <- as.DNAbin(tmp)
seq <- rbind(seq,outgroup.seq.pol)
seq.ph <- nj(dist.dna(seq, model='raw'))
tmp <- which(seq.ph$tip.label=="HXB2")
seq.ph <- reroot(seq.ph, tmp, seq.ph$edge.length[which(seq.ph$edge[,2]==tmp)])
seq.ph <- ladderize(seq.ph)
file <- paste(indir.sg, '/', infile.prefix, 'INFO_NJpol.pdf', sep='')
cat(paste('\nwrite to file',file))
pdf(file=file, w=10, h=0.1*Ntip(seq.ph))
plot(seq.ph, show.tip=TRUE, cex=0.5, tip.color=df.seq.nj[,TIPCOLOR])
dev.off()
}
#
# zip simulated sequence files
#
tmp <- c( paste(outdir, '/', infile.prefix, 'SIMULATED_metadata.csv', sep=''), paste(indir.epi, '/', gsub('SAVE.R','CROSS_SECTIONAL_SURVEY.csv',infile.epi), sep=''), paste(outdir, '/', infile.prefix, 'SIMULATED_env.fa', sep=''), paste(outdir, '/', infile.prefix, 'SIMULATED_pol.fa', sep=''), paste(outdir, '/', infile.prefix, 'SIMULATED_gag.fa', sep='') )
zip( paste(outdir, '/', infile.prefix, 'SIMULATED_SEQ.zip', sep=''), tmp, flags = "-r9XTj")
#
# zip simulated tree files
#
tmp2 <- c( paste(outdir, '/', infile.prefix, 'SIMULATED_metadata.csv', sep=''), paste(indir.epi, '/', gsub('SAVE.R','CROSS_SECTIONAL_SURVEY.csv',infile.epi), sep=''),
paste(indir.epi, '/', infile.prefix, 'DATEDTREE.newick', sep=''),
paste(indir.epi, '/', infile.prefix, 'DATEDCLUTREES.newick', sep=''),
paste(indir.epi, '/', infile.prefix, 'SUBSTTREE.newick', sep=''))
zip( paste(outdir, '/', infile.prefix, 'SIMULATED_TREE.zip', sep=''), tmp2, flags = "-r9XTj")
#
tmp <- unique(c(tmp, tmp2))
dummy <- file.remove(tmp)
#
# zip internal files
#
tmp <- list.files(outdir, pattern='*pdf$', recursive=TRUE, full.names=TRUE)
file.copy(tmp, outdir, overwrite=TRUE)
tmp <- c( paste(outdir, '/', infile.prefix, 'SIMULATED_INTERNAL.R', sep=''), list.files(outdir, pattern='*pdf$', recursive=FALSE, full.names=TRUE) )
zip( paste(outdir, '/', infile.prefix, 'INTERNAL.zip', sep=''), tmp, flags = "-r9XTj")
dummy <- file.remove(tmp)
return(1)
}
##--------------------------------------------------------------------------------------------------------
## olli originally written 26-01-2015
##--------------------------------------------------------------------------------------------------------
#' @export
PANGEA.HPTN071.input.parser.v4<- function(indir, infile.ind, infile.trm, outdir, outfile.ind, outfile.trm, pipeline.args, verbose=1, with.plot=1)
{
stopifnot( all( c('yr.start', 'yr.end', 's.seed', 's.PREV.min', 's.PREV.max', 'epi.dt', 'epi.import')%in%pipeline.args[, stat] ) )
#
infile.ind <- paste(indir, '/', infile.ind, sep='')
infile.trm <- paste(indir, '/', infile.trm, sep='')
outfile.ind <- paste(outdir, '/', outfile.ind, sep='')
outfile.trm <- paste(outdir, '/', outfile.trm, sep='')
# set seed
set.seed( pipeline.args['s.seed',][, as.numeric(v)] )
#
# prepare transmissions
#
df.trm <- as.data.table(read.csv(infile.trm, stringsAsFactors=FALSE, sep=',', dec='.'))
setnames(df.trm, c("IdInfector","IdInfected","TimeOfInfection","IsInfectorAcute"), c('IDTR','IDREC','TIME_TR','TR_ACUTE'))
stopifnot( df.trm[, !any(is.na(TR_ACUTE))] )
set(df.trm, df.trm[, which(TR_ACUTE<0)], 'TR_ACUTE', NA_integer_)
set(df.trm, NULL, 'TR_ACUTE', df.trm[, factor(TR_ACUTE, levels=c(0,1), labels=c('No','Yes'))])
# transmissions happen either at baseline, or at unique times.
# the epi simulation allocates transmissions in 1/48 of a year, so draw a uniform number if there are more transmission per TIME_TR
tmp <- df.trm[, range(TIME_TR)]
df.trm <- df.trm[, {
z<- TIME_TR
if(length(IDTR)>1)
z<- sort(runif(length(IDTR), z, min(ceiling(tmp[2]), z+pipeline.args['epi.dt',][, as.numeric(v)])))
list(IDTR=IDTR, IDREC=IDREC, TIME_TR.new=z, TR_ACUTE=TR_ACUTE, l=length(IDTR))
}, by='TIME_TR']
df.trm[, TIME_TR:=NULL]
setnames(df.trm, 'TIME_TR.new', 'TIME_TR')
# stop if not all transmission times are unique, except at the end which does not matter
stopifnot( subset(df.trm, TIME_TR<max(TIME_TR))[, length(unique(TIME_TR))==length(TIME_TR)] )
# set baseline cases as negative ID
tmp <- df.trm[, which(IDTR=='-1')]
cat(paste('\nFound index cases, n=', length(tmp)))
set(df.trm, tmp, 'IDTR', rev(-seq_along(tmp)))
cat(paste('\nFound transmissions, n=', nrow(df.trm)))
cat(paste('\nTotal transmitters, n=', df.trm[, length(unique(IDTR))]))
cat(paste('\nTotal index cases, n=', df.trm[, length(which(unique(IDTR)<0))]))
#
# prepare patient metavariables
#
df.ind <- as.data.table(read.csv(infile.ind, stringsAsFactors=FALSE))
setnames(df.ind, c( "Id","Sex","DoB","DoD","RiskGp"), c('IDPOP','GENDER','DOB','DOD','RISK'))
setnames(df.ind, c( "HIV_pos", "t_diagnosed","cd4_diagnosis","cd4atfirstART","t_1stARTstart","t_1stVLsupp_start","t_1stVLsupp_stop"),
c( 'HIV', 'DIAG_T','DIAG_CD4','ART1_CD4','ART1_T',"VLS1_TS","VLS1_TE"))
set(df.ind, NULL, 'GENDER', df.ind[, factor(GENDER)])
set(df.ind, NULL, 'RISK', df.ind[, factor(RISK)])
stopifnot( df.ind[, !any(is.na(DOB))] )
set(df.ind, df.ind[, which(DOD==-1)], 'DOD', ceiling(max(df.trm$TIME_TR))+1.)
set(df.ind, NULL, 'HIV', df.ind[, factor(HIV, levels=c(0,1), labels=c('N','Y'))])
cat(paste('\nFound HIV+, n=', df.ind[, length(which(HIV=='Y'))]))
set(df.ind, df.ind[, which(DIAG_T=='ND')], 'DIAG_T', NA_character_)
set(df.ind, NULL, 'DIAG_T', df.ind[, as.numeric(DIAG_T)])
set(df.ind, df.ind[, which(DIAG_CD4<0)], 'DIAG_CD4', NA_real_)
stopifnot( df.ind[, !any(HIV=='N' & !is.na(DIAG_T)) ] )
stopifnot( df.ind[, !any(!is.na(DIAG_T) & is.na(DIAG_CD4))] )
cat(paste('\nFound not % undiagnosed , diagnosed=', paste( subset(df.ind, HIV=='Y')[, round(table(!is.na(DIAG_T)) / length(DIAG_T), d=2)], collapse=', ' )))
set(df.ind, df.ind[, which(ART1_CD4<0)], 'ART1_CD4', NA_real_)
set(df.ind, df.ind[, which(ART1_T<0)], 'ART1_T', NA_real_)
stopifnot( df.ind[, all(DIAG_T<ART1_T, na.rm=TRUE)] )
cat(paste('\nFound not % on ART , not on ART among diagnosed=', paste(subset(df.ind, !is.na(DIAG_T))[, round( table(is.na(ART1_T))/length(ART1_T), d=2)], collapse=', ') ))
set(df.ind, df.ind[, which(VLS1_TS<0)], 'VLS1_TS', NA_real_)
set(df.ind, df.ind[, which(VLS1_TE<0)], 'VLS1_TE', NA_real_)
tmp <- df.ind[, which(ART1_T>=VLS1_TS)]
cat(paste('\nFound ART1_T<VLS1_TS, setting VLS1_TS and VLS1_TE to NA, n=', length(tmp)))
set(df.ind, tmp, 'VLS1_TS', NA_real_)
set(df.ind, tmp, 'VLS1_TE', NA_real_)
stopifnot( df.ind[, all(ART1_T<VLS1_TS, na.rm=TRUE)] )
stopifnot( df.ind[, all(VLS1_TS<VLS1_TE, na.rm=TRUE)] )
cat(paste('\nFound not % reached viral suppression , did not reach viral suppression among treated=', paste(subset(df.ind, !is.na(ART1_T))[, round( table(is.na(VLS1_TS))/length(VLS1_TS), d=2)], collapse=', ') ))
tmp <- df.ind[, which(DIAG_CD4<ART1_CD4-DIAG_CD4*0.5 & DIAG_CD4>250)]
cat(paste('\nFound individuals whose CD4 at ART start is much higher than at diagnosis, n=', length(tmp)))
#stopifnot( df.ind[, all(DIAG_CD4>ART1_CD4, na.rm=TRUE)] )
stopifnot( df.ind[, all(DIAG_CD4>0, na.rm=TRUE)] )
stopifnot( df.ind[, all(ART1_CD4>0, na.rm=TRUE)] )
# add transmission time
tmp <- subset(df.trm, select=c(IDREC, TIME_TR))
setnames(tmp, 'IDREC','IDPOP')
df.ind <- merge(df.ind, tmp, by='IDPOP', all.x=TRUE)
stopifnot( df.ind[, all(TIME_TR<DIAG_T, na.rm=TRUE)] )
stopifnot( df.ind[, !any(is.na(TIME_TR) & !is.na(DIAG_T))] )
cat(paste('\nFound individuals with a valid record, n=', nrow(df.ind)))
cat(paste('\nFound individuals with an infection event, n=', nrow(subset(df.ind,!is.na(TIME_TR)))))
# reset times if needed, because TIME_TR got randomized by a small bit above
tmp <- df.ind[, which(DIAG_T<TIME_TR)]
set(df.ind, tmp, 'DIAG_T', df.ind[tmp, TIME_TR+(TIME_TR-DIAG_T)])
tmp <- df.ind[, which(ART1_T<DIAG_T)]
set(df.ind, tmp, 'ART1_T', df.ind[tmp, DIAG_T+(DIAG_T-ART1_T)])
stopifnot( df.ind[, all(ART1_T<VLS1_TS, na.rm=TRUE)] )
tmp <- df.ind[, which(DOD<TIME_TR)]
set(df.ind, tmp, 'DOD', df.ind[tmp, TIME_TR+(TIME_TR-DOD)])
# add if transmission in the last 6 mo
df.ind[, RECENT_TR:=df.ind[, factor(as.numeric((DIAG_T-TIME_TR)<.5), levels=c(0,1), labels=c('N','Y'))]]
# add time of infection of transmitter to df.trm
tmp <- subset(df.ind, select=c(IDPOP, TIME_TR))
setnames(tmp, c('IDPOP','TIME_TR'), c('IDTR','IDTR_TIME_INFECTED') )
setkey(tmp, IDTR)
df.trm <- merge(df.trm, unique(tmp, by='IDTR'), by='IDTR', all.x=TRUE)
stopifnot( df.trm[, !any(TIME_TR<=IDTR_TIME_INFECTED, na.rm=TRUE)] )
# simulate time individual ready for sequencing
df.ind <- PANGEA.Seqsampler.SimulateGuideToSamplingTimes.v2(df.ind, seqtime.mode= pipeline.args['seqtime.mode',][,v])
cat(paste('\nFound % sampled at or after ART start=', subset(df.ind, !is.na(DIAG_T))[, mean(!is.na(ART1_T) & T1_SEQ>=ART1_T, na.rm=TRUE)] ))
cat(paste('\nFound % sampled after end of first viral suppression=', subset(df.ind, !is.na(DIAG_T))[, mean(!is.na(VLS1_TE) & T1_SEQ>=VLS1_TE, na.rm=TRUE)] ))
#
#
# reduce to time frame of interest
#
#tmp <- subset( df.trm, TIME_TR>=as.numeric( pipeline.args['yr.end',][, as.numeric(v)] ) )[, IDREC]
df.trm <- subset( df.trm, TIME_TR<as.numeric( pipeline.args['yr.end',][, as.numeric(v)] ) )
#df.ind <- subset(df.ind, !IDPOP%in%tmp)
df.ind <- subset(df.ind, DOB<pipeline.args['yr.end',][, as.numeric(v)] )
df.ind <- subset(df.ind, is.na(DOD) | DOD >= floor(min(df.trm$TIME_TR)) )
cat(paste('\nFound individuals born before',pipeline.args['yr.end',][, as.numeric(v)],', n=', nrow(df.ind)))
cat(paste('\nFound transmissions before',pipeline.args['yr.end',][, as.numeric(v)],', n=', nrow(df.trm)))
cat(paste('\nTotal transmitters in sampling frame, n=', df.trm[, length(unique(IDTR))]))
stopifnot( length(setdiff( subset(df.trm, IDTR>0)[, IDTR], df.ind[, IDPOP] ))==0 )
stopifnot( length(setdiff( df.trm[, IDREC], df.ind[, IDPOP] ))==0 )
# simulate a fraction of transmissions to be imports
tmp <- PANGEA.ImportSimulator.SimulateIndexCase(df.ind, df.trm, epi.import= pipeline.args['epi.import',][,as.numeric(v)])
df.trm <- tmp$df.trm
df.ind <- tmp$df.ind
#
# set infection times for index case
#
# add IDTR_TIME_INFECTED for baseline cases
#tmp <- df.trm[, which(is.na(IDTR_TIME_INFECTED))]
#set( df.trm, tmp, 'IDTR_TIME_INFECTED', df.trm[tmp, runif(length(tmp), TIME_TR-5, TIME_TR)] )
tmp <- PANGEA.ImportSimulator.SimulateStartingTimeOfIndexCase.v2(df.ind, df.trm, index.starttime.mode= pipeline.args['index.starttime.mode',][,v])
df.trm <- tmp$df.trm
df.ind <- tmp$df.ind
#
# sample sequences
#
PANGEA.Seqsampler.v4(df.ind, df.trm, pipeline.args, outfile.ind, outfile.trm, with.plot=with.plot)
#
return(1)
}
######################################################################################
# return GAG POL ENV ancestral sequences from BEAST PARSER output
# olli originally written 13-08-2014
# tree beast trees in ape format, needed to compute calendar time for each ancestral sequence
# node.stat data.table containing meta information in nexus file for nodes
# return list of GAG POL ENV sequences in ape format
PANGEA.RootSeqSim.get.ancestral.seq<- function(tree, node.stat, tree.id.sep='_', tree.id.idx.mcmcit=2, tree.id.burnin=1, label.sep='|', label.idx.ctime=5)
{
tree.id <- names(tree)
# add calendar time for inner node at NODE_ID to node.stat
node.stat[, CALENDAR_TIME:=NA_real_]
setkey(node.stat, TREE_ID, NODE_ID)
for(i in seq_along(tree.id))
{
cat(paste('\nProcess CALENDAR_TIME for tree id', tree.id[i] ))
label.ctime <- sapply( strsplit(tree[[i]]$tip.label, label.sep, fixed=TRUE), '[[', label.idx.ctime )
label.ctime <- as.numeric(label.ctime)
depth <- node.depth.edgelength( tree[[ i ]] )
tmp <- which.max(depth)
depth <- depth-depth[tmp]+label.ctime[tmp]
tmp <- node.stat[, which(TREE_ID==tree.id[i])]
for(j in seq_along(depth))
{
set(node.stat, tmp[ node.stat[tmp, which(NODE_ID==j)] ], 'CALENDAR_TIME', depth[j])
}
}
tmp <- node.stat[, length(which(is.na(CALENDAR_TIME)))]
cat(paste('\nTotal node statistics with no CALENDAR_TIME [should be zero], n=', tmp ))
# keep only inner nodes
tmp <- sapply(tree, Ntip)
stopifnot(all(tmp==tmp[1]))
node.stat <- subset(node.stat, NODE_ID>tmp[1])
#
set(node.stat, NULL, 'VALUE', node.stat[, gsub('\"','',VALUE)])
# checks of ancseq before we proceed
tmp <- node.stat[, list(NSEQ= nchar(VALUE)), by=c('TREE_ID', 'NODE_ID', 'STAT')]
stopifnot( tmp[, list(CHECK= all(NSEQ==NSEQ[1])), by='STAT'][, all(CHECK)] )
set(tmp, NULL, 'GENE', tmp[, sapply(strsplit(STAT,'\\.'),'[[',1)])
set(tmp, NULL, 'CODON_POS', tmp[, sapply(strsplit(STAT,'\\.'),'[[',2)])
tmp <- dcast.data.table(tmp, TREE_ID + NODE_ID + GENE ~ CODON_POS, value.var='NSEQ')
tmp <- tmp[, list(CPM=min(CP1, CP2, CP3)), by=c('TREE_ID','NODE_ID','GENE')]
stopifnot( tmp[, list(CHECK=all(CPM==CPM[1])), by='GENE'][, all(CHECK)] )
setkey(tmp, GENE)
# truncate to following size of coding regions (if necessary)
tmp <- unique(tmp, by='GENE')[, list(STAT=paste(GENE,'.CP',1:3,sep=''), CPM=CPM), by='GENE']
node.stat <- merge(node.stat, subset(tmp, select=c(STAT, CPM)), by='STAT')
set(node.stat, NULL, 'VALUE', node.stat[, substr(VALUE,1,CPM)])
set(node.stat, NULL, 'CPM', NULL)
# reconstruct genes from codon positions
node.stat <- dcast.data.table(node.stat, TREE_ID + NODE_ID + CALENDAR_TIME ~ STAT, value.var="VALUE")
node.stat <- node.stat[, {
tmp <- do.call('rbind',sapply(list(ENV.CP1,ENV.CP2,ENV.CP3), strsplit, ''))
env <- paste(as.vector(tmp), collapse='')
tmp <- do.call('rbind',sapply(list(GAG.CP1,GAG.CP2,GAG.CP3), strsplit, ''))
gag <- paste(as.vector(tmp), collapse='')
tmp <- do.call('rbind',sapply(list(POL.CP1,POL.CP2,POL.CP3), strsplit, ''))
pol <- paste(as.vector(tmp), collapse='')
list(GAG=gag, POL=pol, ENV=env, CALENDAR_TIME=CALENDAR_TIME)
}, by=c('TREE_ID','NODE_ID')]
set(node.stat, NULL, 'LABEL', node.stat[, paste(TREE_ID, NODE_ID, round(CALENDAR_TIME,d=3), sep='|')])
# remove tree id STATE_xx where xx is smaller than burn-in
set(node.stat, NULL, 'BEAST_MCMC_IT', node.stat[, as.integer(sapply(strsplit(TREE_ID,tree.id.sep),'[[',tree.id.idx.mcmcit))])
node.stat <- subset(node.stat, BEAST_MCMC_IT>tree.id.burnin)
cat(paste('\nFound ancestral sequences, n=', nrow(node.stat) ))
node.stat
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.