#' Probability epidemic genetic simulator
#'
#' This function starts with a root nucelotide sequence and evolves it via
#' an epidemic generational model. The result can then be used elsehere to
#' generate newick trees and necessary enxus files for BEAUti/BEAST etc
#'
#'
#' @param fasta.file path to a fasta sequence to be used as the root, otherwise random sequence used
#' @param seq.length length of root sequence if not specified by fasta.file
#' @param N - total popualation size, such that N-I = S
#' @param mu - mutation root in base changes per generation time
#' @param R0 - basic reproduction number
#' @param Tg - mean generation time
#' @param kappa - the rate of transitions and transversions
#' @param model - substitution model - avaialble so far JC69, HKY85
#' @param I - Initial infected population
#' @param samples - 1/samples is equal to the random sample size of tips stored
#'
#' @return Returns a list of 9, with the first 7 lists giving the full infection network. As such
#' res$ID[n] corresponds to res$Time/Parent/IsFather/Offspring/Branch.Length/Sequence/[n]. res$Seq and
#' res$Seq.ID relate to eachother and the value within res$Sequence relates to res$Seq.ID and thus can be
#' used to map a sequence to each individual
#'
#'
#' @export
#'
#' @examples
#' ## simulate data
#' TODO(OJ)
#'
#'
Probability_Sim <- function(fasta.file=NULL,seq.length=1000,N=1000,mu=0.0001,R0=3,Tg=2,kappa=2,model="HKY85",I=1,samples=10,size=1,
fixed.offspring=TRUE,fixed.generations=FALSE,t.cont=FALSE,outgroup=FALSE){
## HANDLE VARIABLES ##
##########################################################
ptm <- proc.time()
## check declared model is valid
model.list <- c("JC69","K80","HKY85");
if(!model %in% model.list ) stop("Model given is not valid")
## handle arguments and create variables
susceptibles <- N-I;
nuc <- as.DNAbin(c("a","c","g","t"))
## EXTRA FUNCTIONS ##
##########################################################
## function to create sequence from scratch
seq_generate <- function(){
res <- sample(nuc, size=seq.length, replace=TRUE)
class(res) <- "DNAbin"
return(res)
}
## function to update results table
update_results <- function(Results,Offspring){
# Update Boolean showing ID is now a node
Results$IsFather[parent.id] <- TRUE;
Results$Offspring[parent.id] <- offspring
# Allocate children by first creating distribution of generation times
# Timing of new infections is drawn randomly from a poisson distribution with mean Tg
if(fixed.generations==TRUE){
times <- sort(rep(Tg,offspring))
} else {
if(t.cont==TRUE){
times <- sort(rgamma(n=offspring,shape=Tg,scale=scale))
} else {
times <- sort(rpois(n=offspring,lambda=Tg))
}
}
# Assign the parent and generation times to the children in order
# i.e. if node 4 & 5 are siblings, Tg[4] < Tg[5]
Results$Parent[child.id:(child.id+offspring-1)] <- parent.id
Results$Branch.Length[child.id:(child.id+offspring-1)] <- times
Results$Time[child.id:(child.id+offspring-1)] <- times+Results$Time[Results$Parent[child.id:(child.id+offspring-1)]]
return(Results)
}
## function to evaluate probability of mutation event
mutation_prob <- function(sequence){
# first assign the class of DNAbin to sequence (funny)
class(sequence)<-"DNAbin"
# second create the transition matrix P
if (model == "JC69"){
P <- JC69(mu,Tg)
} else if (model == "HKY85"){
P <- HKY85(mu, t = Tg, kappa=kappa, pi=base.freq(sequence))
} else if (model == "K80") {
P <- K80(mu, t = Tg, kappa=kappa)
}
# using the transition matrix and the absolute numbers of nucelotides caluclate the probability
# that no mutations occur
abs.freq <- base.freq(sequence,freq=TRUE)
res <- (P[1,1])^abs.freq[1]*(P[2,2])^abs.freq[2]*(P[3,3])^abs.freq[3]*(P[4,4])^abs.freq[4]
return(res)
}
## function to evolve sequence given succesful mutation event
seq_evolve <- function(sequence){
# first create the transition matrix P
if (model == "JC69"){
P <- JC69(mu,Tg)
} else if (model == "HKY85"){
P <- HKY85(mu = mu, t = Tg, kappa = kappa, pi = base.freq(sequence))
} else if (model =="K80") {
P <- K80(mu, t = Tg, kappa=kappa)
}
# using the transition matrix create a mutation matrix given we know a mutation occurs
mut.P <- P
diag(mut.P) <- 0
mut.mat <- Matrix::t(Matrix::t(mut.P)/Matrix::colSums(mut.P))
# randomly find nucelotide position and mutate it accordingly
nuc.pos <- round(runif(1,1,seq.length))
sequence <- s2n(as.character(sequence),base4 = FALSE)
sequence[nuc.pos] <- which(rmultinom(n=1,size=1,p=mut.mat[,sequence[nuc.pos]])==1)
sequence <- nuc[sequence]
# carry out any other mutations that may occur
res <- t(nuc[(which(apply(P[(s2n(as.character(sequence),base4 = FALSE)),], 1, rmultinom, n=1, size=1)==1)-(0:(sequence.length-1))*4)])
return(res)
}
## MAIN FUNCTION ##
##########################################################
## if no sequence is provided, generate random sequence
if(is.null(fasta.file)) {
root <- seq_generate()
} else {
## convert given fasta file path into DNAbin root
f <- read.fasta.format(fasta.file)
root <- nuc[s2n(f$org)+1]
}
sequence.length <- length(root)
## initialise results
Sequence.Results <- list(Seq = root,Seq.ID=1)
if(outgroup==FALSE){
Results <- list(ID=1:N, Time=rep(0,N), Parent=NA, IsFather = rep(FALSE,N), Offspring = NULL, Branch.Length = NULL, Sequence=1)
} else {
## initialise results
Results <- list(ID=1:(N+1), Time=rep(0,(N+1)), Parent=NA, IsFather = rep(FALSE,(N+1)), Offspring = NULL, Branch.Length = NULL, Sequence=1)
}
## set tickers, i.e. the first infected individual
## must be id #2 with parent id #1
parent.id <- 1
child.id <- 2
new.id <- 1
## create starting mutation probability
Prob.Mutation <- mutation_prob(sequence=root)
## create progress bar
##pb <- winProgressBar(title="Percentage of population infected", label="0% done", min=0, max=N, initial=1)
## INITIATE LOOP ##
##########################################################
while (susceptibles>0){
## Check to see if epidemic hasn't died out
if(parent.id==child.id) stop("Epidemic died out!")
## Calculate the number of new infections, randomly choosing from a negative binomial with mean R0
if(fixed.offspring==TRUE){
offspring = R0
} else {
offspring = rnbinom(1,size=size,mu = R0)
}
## If the infected indivdual is the root or the last parent ensure there are new infections
if (parent.id == 1 | (child.id-parent.id ==1)){
while (offspring == 0){
if(fixed.offspring==TRUE){
offspring = R0
} else {
offspring = rnbinom(1,size=size,mu = R0)
}
}
}
## If the infected individual is not the root 0 new infections are possible
if (offspring == 0) {
Results$Offspring[parent.id] = offspring
}
## Consider case when there are more susceptibles than offspring
else if (offspring > 0 && offspring < susceptibles){
# update infection network results
Results <- update_results(Results,offspring)
# determine if there has been a mutation event and if so evolve the sequence and calculate new prob.mutation
if (runif(1,0,1) > Prob.Mutation[Results$Sequence[parent.id]]){
new.id <- max(Sequence.Results$Seq.ID)+1
Results$Sequence[child.id:(child.id+offspring-1)] <- new.id
if (new.id==2){
new.seq <- seq_evolve(sequence = Sequence.Results$Seq[])
Sequence.Results$Seq <- rbind.DNAbin(t(Sequence.Results$Seq),new.seq)
} else {
new.seq <- seq_evolve(sequence = Sequence.Results$Seq[Results$Sequence[parent.id],])
Sequence.Results$Seq <- rbind.DNAbin(Sequence.Results$Seq,new.seq)
}
Prob.Mutation[new.id] <- mutation_prob(sequence = Sequence.Results$Seq[new.id,])
Sequence.Results$Seq.ID[new.id] <- new.id
} else {
Results$Sequence[child.id:(child.id+offspring-1)] <- Results$Sequence[parent.id]
}
## update the ID number of the next child
child.id <- child.id + offspring
}
## Consider case when there are more offspring than susceptibles
else {
## Cap number of offspring equal to available susceptibles
offspring <- susceptibles
## update infection network results
Results <- update_results(Results,offspring)
## determine if there has been a mutation event and if so evolve the sequence anc calculate new prob.mutation
if (runif(1,0,1) > Prob.Mutation[Results$Sequence[parent.id]]){
new.id <- max(Sequence.Results$Seq.ID)+1
Results$Sequence[child.id:(child.id+offspring-1)] <- new.id
if (new.id==2){
new.seq <- seq_evolve(sequence = Sequence.Results$Seq[])
Sequence.Results$Seq <- rbind.DNAbin(t(Sequence.Results$Seq),new.seq)
} else {
new.seq <- seq_evolve(sequence = Sequence.Results$Seq[Results$Sequence[parent.id],])
Sequence.Results$Seq <- rbind.DNAbin(Sequence.Results$Seq,new.seq)
}
Prob.Mutation[new.id] <- mutation_prob(sequence = Sequence.Results$Seq[new.id,])
Sequence.Results$Seq.ID[new.id] <- new.id
} else {
Results$Sequence[child.id:(child.id+offspring-1)] <- Results$Sequence[parent.id]
}
## update the ID number of the next child
child.id <- child.id + offspring
}
## update the change in susceptibles
susceptibles <- susceptibles - offspring
## update the ID number of the next parent node to be considered
parent.id <- parent.id + 1
## update progress bar
#info <- sprintf("%d%% done", round((((N-susceptibles)/N)*100)))
#setWinProgressBar(pb, round((((N-susceptibles)/N)*N)), label=info)
}
if (outgroup==TRUE){
correct.parent.id <- parent.id
parent.id <- 1
child.id <- N+1
offspring <- 1
new.id <- max(Sequence.Results$Seq.ID)+1
Results$Sequence[child.id:(child.id+offspring-1)] <- new.id
# Update Boolean showing ID is now a node
Results$IsFather[parent.id] <- TRUE;
Results$Offspring[parent.id] <- offspring
# Allocate children by first creating distribution of generation times
# Timing of new infections is drawn randomly from a poisson distribution with mean Tg
times <- max(Results$Time)
# Assign the parent and generation times to the children in order
# i.e. if node 4 & 5 are siblings, Tg[4] < Tg[5]
Results$Parent[child.id:(child.id+offspring-1)] <- parent.id
Results$Branch.Length[child.id:(child.id+offspring-1)] <- times
Results$Time[child.id:(child.id+offspring-1)] <- times+Results$Time[Results$Parent[child.id:(child.id+offspring-1)]]
## seq evolve to tip
sequence = Sequence.Results$Seq[Results$Sequence[parent.id],]
# first create the transition matrix P
if (model == "JC69"){
P <- JC69(mu,max(Results$Time))
} else if (model == "HKY85"){
P <- HKY85(mu = mu, t = max(Results$Time), kappa = kappa, pi = base.freq(sequence))
}
# using the transition matrix create a mutation matrix given we know a mutation occurs
mut.P <- P
diag(mut.P) <- 0
mut.mat <- Matrix::t(Matrix::t(mut.P)/Matrix::colSums(mut.P))
# randomly find nucelotide position and mutate it accordingly
nuc.pos <- round(runif(1,1,seq.length))
sequence <- s2n(as.character(sequence),base4 = FALSE)
sequence[nuc.pos] <- which(rmultinom(n=1,size=1,p=mut.mat[,sequence[nuc.pos]])==1)
sequence <- nuc[sequence]
# carry out any other mutations that may occur
new.seq <- t(nuc[(which(apply(P[(s2n(as.character(sequence),base4 = FALSE)),], 1, rmultinom, n=1, size=1)==1)-(0:(sequence.length-1))*4)])
Sequence.Results$Seq <- rbind.DNAbin(Sequence.Results$Seq,new.seq)
Prob.Mutation[new.id] <- mutation_prob(sequence = Sequence.Results$Seq[new.id,])
Sequence.Results$Seq.ID[new.id] <- new.id
parent.id <- correct.parent.id
}
## Check to see evolution as occurred!
if(new.id==1) stop("No Evolution - Perhaps root sequence is not realistic?")
row.names(Sequence.Results$Seq)= makeLabel(rep("seq",max(Sequence.Results$Seq.ID)),make.unique = TRUE)
## turn parent.id into a list to put into the result for the purposes of being able to figure out while tips are extant
Parent.ID <- list(parent.id=parent.id)
res <- append(Results,Sequence.Results)
res <- append(res,Parent.ID)
## print times
#close(pb)
print(paste(N,"individuals analysed, concerning a",length(root),"long sequence within",((proc.time() - ptm)[3]),"secs"))
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.