sims4: Epidemic generational simualator

#' Time explicit simulator
#' 
#'
#' This function starts with a root nucelotide sequence and evolves it via
#' an epidemic generational model. The result can then be used elsehere to 
#' generate newick trees and necessary enxus files for BEAUti/BEAST etc. 
#'
#'
#' @param fasta.file path to a fasta sequence to be used as the root, otherwise random sequence used
#' @param seq.length length of root sequence if not specified by fasta.file
#' @param N total popualation size, such that N-I = S
#' @param mu mutation root in base changes per generation time
#' @param R0 basic reproduction number
#' @param Tg mean generation time
#' @param kappa the rate of transitions and transversions  
#' @param model substitution model - avaialble so far JC69, HKY85
#' @param I Initial infected population
#' @param scale rgamma scale parameter
#' @param size rnbinom size parameter
#' @param parallel if evolution occurs between parent and all children. i.e. each sequence inherited is evolved
#' over a period of time equal to its branch length from the parent.
#' @param true_perfection always 1 mutation

#' @return Returns a list of 11, with the first 7 lists giving the full infection network. As such
#' res$ID[n] corresponds to res$Time/Parent/IsFather/Offspring/Branch.Length/Sequence/[n]. res$Seq and 
#' res$Seq.ID relate to eachother and the value within res$Sequence relates to res$Seq.ID and thus can be 
#' used to map a sequence to each individual, and parent.id gives the id of the next individual that would be 
#' considered for infection. new.id is a counter and will always be equal to 2. 
#' 
#'
#' @export
#'
#' @examples

#' 
#'


Time_Sim <- function(fasta.file=NULL,seq.length=1000,N=1000,mu=0.0001,R0=2,Tg=50,scale=1,size=1,kappa=2,
                     model="HKY85",I=1,withinhost=TRUE,fixed.offspring=FALSE,fixed.generations=FALSE,t.cont=FALSE,outgroup=FALSE,
                     parallel=FALSE,true_perfection=FALSE){
  
  
  ## HANDLE VARIABLES ##
  ##########################################################
  ptm <- proc.time()
  
  ## check declared model is valid
  model.list <- c("JC69","HKY85");
  if(!model %in% model.list ) stop("Model given is not valid")
  
  ## handle arguments and create variables
  susceptibles <- N-I;
  nuc <- as.DNAbin(c("a","c","g","t"))
  
  
  ## EXTRA FUNCTIONS ##
  ##########################################################
  
  ## function to create sequence from scratch
  seq_generate <- function(){
    res <- sample(nuc, size=seq.length, replace=TRUE)
    class(res) <- "DNAbin"
    return(res)
  }
  
  ## function to update results table
  update_results <- function(Results,offspring,parent.id){
    # Update Boolean showing ID is now a node
    Results$IsFather[parent.id] <- TRUE;
    Results$Offspring[parent.id] <- offspring
    
    # Allocate children by first creating distribution of generation times
    # Timing of new infections is drawn randomly from a poisson distribution with mean Tg
    if(fixed.generations==TRUE){
      times <- sort(rep(Tg,offspring))
    } else {
      if(t.cont==TRUE){
        times <- sort(rgamma(n=offspring,shape=Tg,scale=scale))
      } else {
        times <- sort(rpois(n=offspring,lambda=Tg))
      }
    }
    
    # Assign the parent and generation times to the children in order
    # i.e. if node 4 & 5 are siblings, Tg[4] < Tg[5]
    Results$Parent[child.id:(child.id+offspring-1)] <- parent.id
    Results$Branch.Length[child.id:(child.id+offspring-1)] <- times
    Results$Time[child.id:(child.id+offspring-1)] <- times+Results$Time[Results$Parent[child.id:(child.id+offspring-1)]]
    
    ## Sequence Evolution
    
    for (i in 1:offspring){
      ## catach the vector vs matrix issue
      if(Results$new.id==1){
        new.seq <- seq_time_evolve(sequence = Results$Seq[],time=times[i])
        gen.seq <- new.seq
        Results$Seq <- rbind.DNAbin(t(Results$Seq),new.seq)
        Results$new.id <- 2
      } else {
        if(withinhost==TRUE){
          if(i==1){
            new.seq <- seq_time_evolve(sequence = Results$Seq[parent.id,],time=times[i])
            Results$Seq <- rbind.DNAbin(Results$Seq,new.seq)
          } else {
            new.seq <- seq_time_evolve(sequence = Results$Seq[child.id-2+i,],time=(times[i]-times[i-1]))
            Results$Seq <- rbind.DNAbin(Results$Seq,new.seq)
          }
        } else {
          if(i==1){
            new.seq <- seq_time_evolve(sequence = Results$Seq[parent.id,],time=mean(times))
            gen.seq <- new.seq
          } else {
            if(parallel==TRUE){
              new.seq <- seq_time_evolve(sequence = Results$Seq[parent.id,],time=mean(times))
            } else {
              new.seq <- gen.seq  
            }
          }
          
          Results$Seq <- rbind.DNAbin(Results$Seq,new.seq)
        }
      }
    }
    
    return(Results)
    
  }
  
  ## function to evolve sequence in time explicit manner
  seq_time_evolve <- function(sequence,time){
    # first create the transition matrix P
    if (model == "JC69"){
      P <- JC69(mu,Tg)
    } else if (model == "HKY85"){
      P <- HKY85(mu = mu, t = time, kappa = kappa, pi = base.freq(sequence))
    }
    
    if(true_perfection==TRUE){
      mut.P <- P
      diag(mut.P) <- 0
      mut.mat <- Matrix::t(Matrix::t(mut.P)/Matrix::colSums(mut.P))
      
      # randomly find nucelotide position and mutate it accordingly
      nuc.pos <- round(runif(1,1,seq.length))
      sequence <- s2n(as.character(sequence),base4 = FALSE)
      if(true_perfection_outgroup==TRUE){
        sequence[nuc.pos] <- which(rmultinom(n=runif(max(case.tree$Time),1,seq.length),size=1,p=(mut.mat[,sequence[nuc.pos]]))==1)
      } else {
        sequence[nuc.pos] <- which(rmultinom(n=1,size=1,p=(mut.mat[,sequence[nuc.pos]]))==1)
      }
      sequence <- t(nuc[sequence])
      res <- sequence
    } else {
      
      res <- t(nuc[(which(apply(P[(s2n(as.character(sequence),base4 = FALSE)),], 1, rmultinom, n=1, size=1)==1)-(0:(sequence.length-1))*4)])
    }
    return(res)
  }
  
  ## MAIN FUNCTION ##
  ##########################################################
  
  ## if no sequence is provided, generate random sequence
  if(is.null(fasta.file)) {
    
    load("Data/root.RData")
    
  } else {
    ## convert given fasta file path into DNAbin root
    f <- read.fasta.format(fasta.file)
    root <- nuc[s2n(f$org)+1]
  }
  sequence.length <- length(root)
  
  ## initialise results
  if(outgroup==FALSE){
    Results <- list(ID=1:N, Time=rep(0,N), Parent=NA, IsFather = rep(FALSE,N), Offspring = NULL, 
                    Branch.Length = NULL, Sequence=1:N,Seq = root,Seq.ID=1:N,new.id=1) 
  } else {
    ## initialise results
    Results <- list(ID=1:(N+1), Time=rep(0,(N+1)), Parent=NA, IsFather = rep(FALSE,(N+1)), Offspring = NULL, 
                    Branch.Length = NULL, Sequence=1:(N+1),Seq = root,Seq.ID=1:(N+1),new.id=1) 
  }
  ## set tickers, i.e. the first infected individual 
  ## must be id #2 with parent id #1
  parent.id <- 1
  child.id <- 2
  
  
  
  true_perfection_outgroup <- FALSE
  ## create progress bar
  #pb <- winProgressBar(title="Percentage of population infected", label="0% done", min=0, max=N, initial=1)
  
  ## INITIATE LOOP ##
  ##########################################################
  
  while (susceptibles>0){
    
    ## Check to see if epidemic hasn't died out
    if(parent.id==child.id) stop("Epidemic died out!")
    
    ## Calculate the number of new infections = R0
    
    if(fixed.offspring==TRUE){
      offspring = R0
    } else {
      offspring = rnbinom(1,size=size,mu = R0) 
    }
    
    
    
    ## If the infected indivdual is the root or the last parent ensure there are new infections
    if (parent.id == 1 | (child.id-parent.id ==1)){
      while (offspring == 0){
        
        if(fixed.offspring==TRUE){
          offspring = R0
        } else {
          offspring = rnbinom(1,size=size,mu = R0) 
        }
        
      }
    }
    
    ## If the infected individual is not the root 0 new infections are possible
    if (offspring == 0) {
      Results$Offspring[parent.id] = offspring
    }
    
    ## Consider case when there are more susceptibles than offspring
    else if (offspring > 0 && offspring < susceptibles){
      
      # update infection network results
      Results <- update_results(Results=Results,offspring=offspring,parent.id=parent.id)
      
      ## update the ID number of the next child
      child.id <- child.id + offspring
    }
    
    ## Consider case when there are more offspring than susceptibles
    else {
      
      ## Cap number of offspring equal to available susceptibles
      offspring <- susceptibles
      
      # update infection network results
      Results <- update_results(Results=Results,offspring=offspring,parent.id=parent.id)
      
      ## update the ID number of the next child
      child.id <- child.id + offspring
    }
    
    ## update the change in susceptibles 
    susceptibles <- susceptibles - offspring
    
    ## update the ID number of the next parent node to be considered
    parent.id <- parent.id + 1
    
    
    ## update progress bar
    #info <- sprintf("%d%% done", round((((N-susceptibles)/N)*100)))
    #setWinProgressBar(pb, round((((N-susceptibles)/N)*N)), label=info)
    
  }
  
  
  if (outgroup==TRUE){
    
    true_perfection_outgroup <- TRUE
    correct.parent.id <- parent.id
    parent.id <- 1
    child.id <- N+1
    offspring <- 1
    
    times <- max(Results$Time)
    Results$Parent[child.id:(child.id+offspring-1)] <- parent.id
    Results$Branch.Length[child.id:(child.id+offspring-1)] <- times
    Results$Time[child.id:(child.id+offspring-1)] <- times+Results$Time[Results$Parent[child.id:(child.id+offspring-1)]]
    
    ## Sequence Evolution
    
    new.seq <- seq_time_evolve(sequence = Results$Seq[parent.id,],time=times)
    Results$Seq <- rbind.DNAbin(Results$Seq,new.seq)
    
    parent.id <- correct.parent.id
    
  }
  
  
  row.names(Results$Seq)= makeLabel(rep("seq",max(Results$Seq.ID)),make.unique = TRUE)
  
  ## turn parent.id into a list to put into the result for the purposes of being able to figure out while tips are extant
  Parent.ID <- list(parent.id=parent.id)
  res <- append(Results,Parent.ID)
  
  ## print times
  #close(pb)
  print(paste(N,"individuals analysed, concerning a",length(root),"long sequence within",((proc.time() - ptm)[3]),"secs"))
  
  return(res)
}