sims4: Epidemic generational simualator

#' Probability epidemic genetic simulator
#'
#' This function starts with a root nucelotide sequence and evolves it via
#' an epidemic generational model. The result can then be used elsehere to 
#' generate newick trees and necessary enxus files for BEAUti/BEAST etc 
#'
#'
#' @param fasta.file path to a fasta sequence to be used as the root, otherwise random sequence used
#' @param seq.length length of root sequence if not specified by fasta.file
#' @param N - total popualation size, such that N-I = S
#' @param mu - mutation root in base changes per generation time
#' @param R0 - basic reproduction number
#' @param Tg - mean generation time
#' @param kappa - the rate of transitions and transversions  
#' @param model - substitution model - avaialble so far JC69, HKY85
#' @param I - Initial infected population
#' @param samples - 1/samples is equal to the random sample size of tips stored 
#'
#' @return Returns a list of 9, with the first 7 lists giving the full infection network. As such
#' res$ID[n] corresponds to res$Time/Parent/IsFather/Offspring/Branch.Length/Sequence/[n]. res$Seq and 
#' res$Seq.ID relate to eachother and the value within res$Sequence relates to res$Seq.ID and thus can be 
#' used to map a sequence to each individual 
#' 
#' 
#' @export
#'
#' @examples
#' ## simulate data
#' TODO(OJ)
#' 
#'

Probability_Sim <- function(fasta.file=NULL,seq.length=1000,N=1000,mu=0.0001,R0=3,Tg=2,kappa=2,model="HKY85",I=1,samples=10,size=1,
                           fixed.offspring=TRUE,fixed.generations=FALSE,t.cont=FALSE,outgroup=FALSE){
  
  ## HANDLE VARIABLES ##
  ##########################################################
  ptm <- proc.time()
  
  ## check declared model is valid
  model.list <- c("JC69","K80","HKY85");
  if(!model %in% model.list ) stop("Model given is not valid")
  
  ## handle arguments and create variables
  susceptibles <- N-I;
  nuc <- as.DNAbin(c("a","c","g","t"))
  
  
  ## EXTRA FUNCTIONS ##
  ##########################################################
  
  ## function to create sequence from scratch
  seq_generate <- function(){
    res <- sample(nuc, size=seq.length, replace=TRUE)
    class(res) <- "DNAbin"
    return(res)
  }
  
  ## function to update results table
  update_results <- function(Results,Offspring){
    # Update Boolean showing ID is now a node
    Results$IsFather[parent.id] <- TRUE;
    Results$Offspring[parent.id] <- offspring
    
    # Allocate children by first creating distribution of generation times
    # Timing of new infections is drawn randomly from a poisson distribution with mean Tg
    if(fixed.generations==TRUE){
      times <- sort(rep(Tg,offspring))
    } else {
    if(t.cont==TRUE){
      times <- sort(rgamma(n=offspring,shape=Tg,scale=scale))
    } else {
      times <- sort(rpois(n=offspring,lambda=Tg))
    }
    }
    
    # Assign the parent and generation times to the children in order
    # i.e. if node 4 & 5 are siblings, Tg[4] < Tg[5]
    Results$Parent[child.id:(child.id+offspring-1)] <- parent.id
    Results$Branch.Length[child.id:(child.id+offspring-1)] <- times
    Results$Time[child.id:(child.id+offspring-1)] <- times+Results$Time[Results$Parent[child.id:(child.id+offspring-1)]]
    return(Results)
    
  }
  
  ## function to evaluate probability of mutation event
  mutation_prob <- function(sequence){
    # first assign the class of DNAbin to sequence (funny)
    class(sequence)<-"DNAbin"
    # second create the transition matrix P
    if (model == "JC69"){
      P <- JC69(mu,Tg)
    } else if (model == "HKY85"){
      P <- HKY85(mu, t = Tg, kappa=kappa, pi=base.freq(sequence))
    } else if (model == "K80") {
      P <- K80(mu, t = Tg, kappa=kappa)
    }
    # using the transition matrix and the absolute numbers of nucelotides caluclate the probability
    # that no mutations occur
    abs.freq <- base.freq(sequence,freq=TRUE)
    res <- (P[1,1])^abs.freq[1]*(P[2,2])^abs.freq[2]*(P[3,3])^abs.freq[3]*(P[4,4])^abs.freq[4]
    return(res)
  }
  
  ## function to evolve sequence given succesful mutation event
  seq_evolve <- function(sequence){
    # first create the transition matrix P
    if (model == "JC69"){
      P <- JC69(mu,Tg)
    } else if (model == "HKY85"){
      P <- HKY85(mu = mu, t = Tg, kappa = kappa, pi = base.freq(sequence))
    } else if (model =="K80") {
      P <- K80(mu, t = Tg, kappa=kappa)
    }
    # using the transition matrix create a mutation matrix given we know a mutation occurs
    mut.P <- P
    diag(mut.P) <- 0
    mut.mat <- Matrix::t(Matrix::t(mut.P)/Matrix::colSums(mut.P))
    
    # randomly find nucelotide position and mutate it accordingly
    nuc.pos <- round(runif(1,1,seq.length))
    sequence <- s2n(as.character(sequence),base4 = FALSE)
    sequence[nuc.pos] <- which(rmultinom(n=1,size=1,p=mut.mat[,sequence[nuc.pos]])==1)
    sequence <- nuc[sequence]
    
    # carry out any other mutations that may occur
    res <- t(nuc[(which(apply(P[(s2n(as.character(sequence),base4 = FALSE)),], 1, rmultinom, n=1, size=1)==1)-(0:(sequence.length-1))*4)])
    
    return(res)
  }
  
  ## MAIN FUNCTION ##
  ##########################################################
  
  ## if no sequence is provided, generate random sequence
  if(is.null(fasta.file)) {
    root <- seq_generate()
  } else {
    ## convert given fasta file path into DNAbin root
    f <- read.fasta.format(fasta.file)
    root <- nuc[s2n(f$org)+1]
  }
  sequence.length <- length(root)
  
  ## initialise results
  Sequence.Results <- list(Seq = root,Seq.ID=1)
  if(outgroup==FALSE){
    Results <- list(ID=1:N, Time=rep(0,N), Parent=NA, IsFather = rep(FALSE,N), Offspring = NULL, Branch.Length = NULL, Sequence=1) 
  } else {
    ## initialise results
    Results <- list(ID=1:(N+1), Time=rep(0,(N+1)), Parent=NA, IsFather = rep(FALSE,(N+1)), Offspring = NULL, Branch.Length = NULL, Sequence=1)  
  }
  
  ## set tickers, i.e. the first infected individual 
  ## must be id #2 with parent id #1
  parent.id <- 1
  child.id <- 2
  new.id <- 1
  ## create starting mutation probability
  Prob.Mutation <- mutation_prob(sequence=root)
  
  ## create progress bar
  ##pb <- winProgressBar(title="Percentage of population infected", label="0% done", min=0, max=N, initial=1)
  
  ## INITIATE LOOP ##
  ##########################################################
  
  while (susceptibles>0){
    
    ## Check to see if epidemic hasn't died out
    if(parent.id==child.id) stop("Epidemic died out!")
    
    ## Calculate the number of new infections, randomly choosing from a negative binomial with mean R0
    if(fixed.offspring==TRUE){
      offspring = R0
    } else {
      offspring = rnbinom(1,size=size,mu = R0) 
    }
    
    ## If the infected indivdual is the root or the last parent ensure there are new infections
    if (parent.id == 1 | (child.id-parent.id ==1)){
      while (offspring == 0){
        if(fixed.offspring==TRUE){
          offspring = R0
        } else {
          offspring = rnbinom(1,size=size,mu = R0) 
        }
      }
    }
    
    ## If the infected individual is not the root 0 new infections are possible
    if (offspring == 0) {
      Results$Offspring[parent.id] = offspring
    }
    
    ## Consider case when there are more susceptibles than offspring
    else if (offspring > 0 && offspring < susceptibles){
      
      # update infection network results
      Results <- update_results(Results,offspring)
      
      # determine if there has been a mutation event and if so evolve the sequence and calculate new prob.mutation
      if (runif(1,0,1) > Prob.Mutation[Results$Sequence[parent.id]]){
        new.id <- max(Sequence.Results$Seq.ID)+1
        Results$Sequence[child.id:(child.id+offspring-1)] <- new.id
        if (new.id==2){
          new.seq <- seq_evolve(sequence = Sequence.Results$Seq[])
          Sequence.Results$Seq <- rbind.DNAbin(t(Sequence.Results$Seq),new.seq)
        } else {
          new.seq <- seq_evolve(sequence = Sequence.Results$Seq[Results$Sequence[parent.id],])
          Sequence.Results$Seq <- rbind.DNAbin(Sequence.Results$Seq,new.seq)
        }
        Prob.Mutation[new.id] <- mutation_prob(sequence = Sequence.Results$Seq[new.id,])
        Sequence.Results$Seq.ID[new.id] <- new.id
      } else {
        Results$Sequence[child.id:(child.id+offspring-1)] <- Results$Sequence[parent.id]
      }
      
      ## update the ID number of the next child
      child.id <- child.id + offspring
    }
    
    ## Consider case when there are more offspring than susceptibles
    else {
      
      ## Cap number of offspring equal to available susceptibles
      offspring <- susceptibles
      
      ## update infection network results
      Results <- update_results(Results,offspring)
      
      ## determine if there has been a mutation event and if so evolve the sequence anc calculate new prob.mutation
      if (runif(1,0,1) > Prob.Mutation[Results$Sequence[parent.id]]){
        new.id <- max(Sequence.Results$Seq.ID)+1
        Results$Sequence[child.id:(child.id+offspring-1)] <- new.id
        
         if (new.id==2){
          new.seq <- seq_evolve(sequence = Sequence.Results$Seq[])
          Sequence.Results$Seq <- rbind.DNAbin(t(Sequence.Results$Seq),new.seq)
        } else {
          new.seq <- seq_evolve(sequence = Sequence.Results$Seq[Results$Sequence[parent.id],])
          Sequence.Results$Seq <- rbind.DNAbin(Sequence.Results$Seq,new.seq)
        }
        Prob.Mutation[new.id] <- mutation_prob(sequence = Sequence.Results$Seq[new.id,])
        Sequence.Results$Seq.ID[new.id] <- new.id
      } else {
        Results$Sequence[child.id:(child.id+offspring-1)] <- Results$Sequence[parent.id]
      }
      
      ## update the ID number of the next child
      child.id <- child.id + offspring
    }
    
    ## update the change in susceptibles 
    susceptibles <- susceptibles - offspring
    
    ## update the ID number of the next parent node to be considered
    parent.id <- parent.id + 1
    
    ## update progress bar
    #info <- sprintf("%d%% done", round((((N-susceptibles)/N)*100)))
    #setWinProgressBar(pb, round((((N-susceptibles)/N)*N)), label=info)
    
  }
  
  if (outgroup==TRUE){
    correct.parent.id <- parent.id
    parent.id <- 1
    child.id <- N+1
    offspring <- 1
    
    new.id <- max(Sequence.Results$Seq.ID)+1
    Results$Sequence[child.id:(child.id+offspring-1)] <- new.id
    
    # Update Boolean showing ID is now a node
    Results$IsFather[parent.id] <- TRUE;
    Results$Offspring[parent.id] <- offspring
    
    # Allocate children by first creating distribution of generation times
    # Timing of new infections is drawn randomly from a poisson distribution with mean Tg
    times <- max(Results$Time)
    
    # Assign the parent and generation times to the children in order
    # i.e. if node 4 & 5 are siblings, Tg[4] < Tg[5]
    Results$Parent[child.id:(child.id+offspring-1)] <- parent.id
    Results$Branch.Length[child.id:(child.id+offspring-1)] <- times
    Results$Time[child.id:(child.id+offspring-1)] <- times+Results$Time[Results$Parent[child.id:(child.id+offspring-1)]]
    
    ## seq evolve to tip
    sequence = Sequence.Results$Seq[Results$Sequence[parent.id],]
    
    # first create the transition matrix P
    if (model == "JC69"){
      P <- JC69(mu,max(Results$Time))
    } else if (model == "HKY85"){
      P <- HKY85(mu = mu, t = max(Results$Time), kappa = kappa, pi = base.freq(sequence))
    }
    # using the transition matrix create a mutation matrix given we know a mutation occurs
    mut.P <- P
    diag(mut.P) <- 0
    mut.mat <- Matrix::t(Matrix::t(mut.P)/Matrix::colSums(mut.P))
    
    # randomly find nucelotide position and mutate it accordingly
    nuc.pos <- round(runif(1,1,seq.length))
    sequence <- s2n(as.character(sequence),base4 = FALSE)
    sequence[nuc.pos] <- which(rmultinom(n=1,size=1,p=mut.mat[,sequence[nuc.pos]])==1)
    sequence <- nuc[sequence]
    
    # carry out any other mutations that may occur
    new.seq <- t(nuc[(which(apply(P[(s2n(as.character(sequence),base4 = FALSE)),], 1, rmultinom, n=1, size=1)==1)-(0:(sequence.length-1))*4)])
    
    Sequence.Results$Seq <- rbind.DNAbin(Sequence.Results$Seq,new.seq)
    
    Prob.Mutation[new.id] <- mutation_prob(sequence = Sequence.Results$Seq[new.id,])
    Sequence.Results$Seq.ID[new.id] <- new.id
    
    parent.id <- correct.parent.id
    
  }
  ## Check to see evolution as occurred!
  if(new.id==1) stop("No Evolution - Perhaps root sequence is not realistic?")
  row.names(Sequence.Results$Seq)= makeLabel(rep("seq",max(Sequence.Results$Seq.ID)),make.unique = TRUE)
  
  ## turn parent.id into a list to put into the result for the purposes of being able to figure out while tips are extant
  Parent.ID <- list(parent.id=parent.id)
  res <- append(Results,Sequence.Results)
  res <- append(res,Parent.ID)
  
  ## print times
  #close(pb)
  print(paste(N,"individuals analysed, concerning a",length(root),"long sequence within",((proc.time() - ptm)[3]),"secs"))
  
  return(res)
}