sims4: Epidemic generational simualator

#' Spatial Generation epidemic genetic simulator
#'
#' This function starts with a root nucelotide sequence and evolves it via
#' a spatial  epidemic generational model. The result can then be used elsehere to 
#' generate newick trees and necessary enxus files for BEAUti/BEAST etc 
#'
#'
#' @param fasta.file path to a fasta sequence to be used as the root, otherwise random sequence used
#' @param seq.length length of root sequence if not specified by fasta.file
#' @param N - total popualation size, such that N-I = S
#' @param mu - mutation root in base changes per generation time
#' @param R0 - basic reproduction number
#' @param Tg - mean generation time
#' @param kappa - the rate of transitions and transversions  
#' @param model - substitution model - avaialble so far JC69, HKY85
#' @param I - Initial infected space
#' @param samples - 1/samples is equal to the random sample size of tips stored 
#'
#' @return Returns a list of 9, with the first 7 lists giving the full infection network. As such
#' res$ID[n] corresponds to res$Time/Parent/IsFather/Offspring/Branch.Length/Sequence/[n]. res$Seq and 
#' res$Seq.ID relate to eachother and the value within res$Sequence relates to res$Seq.ID and thus can be 
#' used to map a sequence to each individual 
#' 
#' 
#' @export
#'
#' @examples
#' ## simulate data
#' TODO(OJ)
#' 
#'

Spatial_Generation_Sim <- function(fasta.file=NULL,seq.length=1000,spaces=3,Nv=rep(100,spaces),
                                   dm=matrix(c(1, 1/100, 1/10000, 1/100, 1, 1/100, 1/10000, 1/100, 1), nrow=3, ncol=3),
                                   mu=0.0001,R0=3,Tg=2,kappa=2,model="HKY85",I=1,samples=10){
  
  ## HANDLE VARIABLES ##
  ##########################################################
  ptm <- proc.time()
  
  ## check declared model is valid
  model.list <- c("JC69","K80","HKY85");
  if(!model %in% model.list ) stop("Model given is not valid")
  
  ## handle arguments and create variables
  ##### ADJUST SUSCEPTIBLES
  Sv <- Nv
  Sv[I] <- Sv[I]-1
  Sv <- matrix(data = Sv, nrow = 1,ncol = 3)
  Iv <- rep(0,spaces)
  Iv[I] <- 1
  Iv <- matrix(data = Iv, nrow = 1,ncol = 3)
  N <- sum(Nv)
  nuc <- as.DNAbin(c("a","c","g","t"))

  
  
  ## EXTRA FUNCTIONS ##
  ##########################################################
  
  ## function to create sequence from scratch
  seq_generate <- function(){
    res <- sample(nuc, size=seq.length, replace=TRUE)
    class(res) <- "DNAbin"
    return(res)
  }
  
  ## function to update results table
  update_results <- function(Results,offspring){
    # Update Boolean showing ID is now a node
    Results$IsFather[parent.id] <- TRUE;
    Results$Offspring[parent.id] <- offspring
    
    # Allocate children by first creating distribution of generation times
    # Timing of new infections is drawn randomly from a poisson distribution with mean Tg
    times <- sort(rpois(offspring,Tg))
    
    ######################################
    ## CALCULATE MATRIX AND WORK OUT LOCATION OF INFECTION
    location.matrix <- cumsum((t((dm*(tail(Sv,1)[1,])))*(tail(Iv,1)[1,]))[Results$Space[parent.id],])/max(cumsum((t((dm*(tail(Sv,1)[1,])))*(tail(Iv,1)[1,]))[Results$Space[parent.id],]))
    location <- runif(1,0,1) >= location.matrix
    inf.location <- min(which(location==FALSE))
    ## Assign location of children
    
    # Assign the parent and generation times to the children in order
    # i.e. if node 4 & 5 are siblings, Tg[4] < Tg[5]
    Results$Parent[child.id:(child.id+offspring-1)] <- parent.id
    Results$Space[child.id:(child.id+offspring-1)] <- inf.location
    Results$Branch.Length[child.id:(child.id+offspring-1)] <- times
    Results$Time[child.id:(child.id+offspring-1)] <- times+Results$Time[Results$Parent[child.id:(child.id+offspring-1)]]
    return(Results)
    
  }
  
  ## function to evaluate probability of mutation event
  mutation_prob <- function(sequence){
    # first assign the class of DNAbin to sequence (funny)
    class(sequence)<-"DNAbin"
    # second create the transition matrix P
    if (model == "JC69"){
      P <- JC69(mu,Tg)
    } else if (model == "HKY85"){
      P <- HKY85(mu, t = Tg, kappa=kappa, pi=base.freq(sequence))
    } else if (model == "K80") {
      P <- K80(mu, t = Tg, kappa=kappa)
    }
    # using the transition matrix and the absolute numbers of nucelotides caluclate the probability
    # that no mutations occur
    abs.freq <- base.freq(sequence,freq=TRUE)
    res <- (P[1,1])^abs.freq[1]*(P[2,2])^abs.freq[2]*(P[3,3])^abs.freq[3]*(P[4,4])^abs.freq[4]
    return(res)
  }
  
  ## function to evolve sequence given succesful mutation event
  seq_evolve <- function(sequence){
    # first create the transition matrix P
    if (model == "JC69"){
      P <- JC69(mu,Tg)
    } else if (model == "HKY85"){
      P <- HKY85(mu = mu, t = Tg, kappa = kappa, pi = base.freq(sequence))
    } else if (model =="K80") {
      P <- K80(mu, t = Tg, kappa=kappa)
    }
    # using the transition matrix create a mutation matrix given we know a mutation occurs
    mut.P <- P
    diag(mut.P) <- 0
    mut.mat <- Matrix::t(Matrix::t(mut.P)/Matrix::colSums(mut.P))
    
    # randomly find nucelotide position and mutate it accordingly
    nuc.pos <- round(runif(1,1,seq.length))
    sequence <- s2n(as.character(sequence),base4 = FALSE)
    sequence[nuc.pos] <- which(rmultinom(n=1,size=1,p=mut.mat[,sequence[nuc.pos]])==1)
    sequence <- nuc[sequence]
    
    # carry out any other mutations that may occur
    res <- t(nuc[(which(apply(P[(s2n(as.character(sequence),base4 = FALSE)),], 1, rmultinom, n=1, size=1)==1)-(0:(sequence.length-1))*4)])
    
    return(res)
  }
  
  ## MAIN FUNCTION ##
  ##########################################################
  
  ## if no sequence is provided, generate random sequence
  if(is.null(fasta.file)) {
    root <- seq_generate()
  } else {
    ## convert given fasta file path into DNAbin root
    root <- read.FASTA(fasta.file)
  }
  sequence.length <- length(root)
  
  ## initialise results
  ######################################
  Results <- list(ID=1:N, Time=rep(0,N), Parent=NA, IsFather = rep(FALSE,N), Offspring = NULL,
                  Branch.Length = NULL, Sequence=1, Space = rep(0,N)) 
  ## Assign location of root
  Results$Space[1] <- I
  Sequence.Results <- list(Seq = root,Seq.ID=1)
  
  ## set tickers, i.e. the first infected individual 
  ## must be id #2 with parent id #1
  parent.id <- 1
  child.id <- 2
  new.id <- 1
  ## create starting mutation probability
  Prob.Mutation <- mutation_prob(sequence=root)
  
  ## create progress bar
  pb <- winProgressBar(title="Percentage of population infected", label="0% done", min=0, max=N, initial=1)
  
  ## INITIATE LOOP ##
  ##########################################################
  
  while (sum(tail(Sv,1))>0){
    
    ## Check to see if epidemic hasn't died out
    if(parent.id==child.id) stop("Epidemic died out!")
    
    ## Calculate the number of new infections, randomly choosing from a negative binomial with mean R0
    offspring = rnbinom(1,1,mu = R0)
    
    ## If the infected indivdual is the root or the last parent ensure there are new infections
    if (parent.id == 1 | (child.id-parent.id ==1)){
      while (offspring == 0){
        offspring = rnbinom(1,1,mu = R0)
      }
    }

    ## first check what the smallest non zero susceptible count is
    if(min(tail(Sv,1))==0){
      snzs <- min(tail(Sv,1)[tail(Sv,1)>0])
    } else {
      snzs <- min(tail(Sv,1))
    }
    
    
    ## If the infected individual is not the root 0 new infections are possible
    if (offspring == 0) {
      Results$Offspring[parent.id] = offspring
    }
    ## Consider case when there are more susceptibles than offspring
    else if (offspring > 0 && offspring < snzs){
      
      # update infection network results
      Results <- update_results(Results,offspring)
      
      # determine if there has been a mutation event and if so evolve the sequence and calculate new prob.mutation
      if (runif(1,0,1) > Prob.Mutation[Results$Sequence[parent.id]]){
        new.id <- max(Sequence.Results$Seq.ID)+1
        Results$Sequence[child.id:(child.id+offspring-1)] <- new.id
        if (new.id==2){
          new.seq <- seq_evolve(sequence = Sequence.Results$Seq[])
          Sequence.Results$Seq <- rbind.DNAbin(t(Sequence.Results$Seq),new.seq)
        } else {
          new.seq <- seq_evolve(sequence = Sequence.Results$Seq[Results$Sequence[parent.id],])
          Sequence.Results$Seq <- rbind.DNAbin(Sequence.Results$Seq,new.seq)
        }
        Prob.Mutation[new.id] <- mutation_prob(sequence = Sequence.Results$Seq[new.id,])
        Sequence.Results$Seq.ID[new.id] <- new.id
      } else {
        Results$Sequence[child.id:(child.id+offspring-1)] <- Results$Sequence[parent.id]
      }
      
      ## update the ID number of the next child
      child.id <- child.id + offspring
    }
    
    ## Consider case when there are more offspring than susceptibles
    else {
      
      ## Cap number of offspring equal to available susceptibles
      offspring <- snzs
      
      ## update infection network results
      Results <- update_results(Results,offspring)
      
      ## determine if there has been a mutation event and if so evolve the sequence anc calculate new prob.mutation
      if (runif(1,0,1) > Prob.Mutation[Results$Sequence[parent.id]]){
        new.id <- max(Sequence.Results$Seq.ID)+1
        Results$Sequence[child.id:(child.id+offspring-1)] <- new.id
        
        if (new.id==2){
          new.seq <- seq_evolve(sequence = Sequence.Results$Seq[])
          Sequence.Results$Seq <- rbind.DNAbin(t(Sequence.Results$Seq),new.seq)
        } else {
          new.seq <- seq_evolve(sequence = Sequence.Results$Seq[Results$Sequence[parent.id],])
          Sequence.Results$Seq <- rbind.DNAbin(Sequence.Results$Seq,new.seq)
        }
        Prob.Mutation[new.id] <- mutation_prob(sequence = Sequence.Results$Seq[new.id,])
        Sequence.Results$Seq.ID[new.id] <- new.id
      } else {
        Results$Sequence[child.id:(child.id+offspring-1)] <- Results$Sequence[parent.id]
      }
      
      ## update the ID number of the next child
      child.id <- child.id + offspring
    }
    
    ## update the change in susceptibles 
    #################CHANGE HERE TO Offspring * c(1,0,0) from location section
    if(offspring > 0){
    change <- rep(0,spaces)
    change[Results$Space[max(which(Results$Space>0))]] <- 1*offspring
    
    #if(parent.id==1){
    #  Sv <- rbind(Sv,Sv-Change)
    #  Iv <- rbind(Iv,Iv+Change)
    #} else {
    Sv <- rbind(Sv,tail(Sv,1)-change)
    Iv <- rbind(Iv,tail(Iv,1)+change)
    #}
    }
    ## update the ID number of the next parent node to be considered
    parent.id <- parent.id + 1
    
    ## update progress bar
    info <- sprintf("%d%% done", round((((N-sum(tail(Sv,1)))/N)*100)))
    setWinProgressBar(pb, round((((N-sum(tail(Sv,1)))/N)*N)), label=info)
    
  }
  ## Check to see evolution as occurred!
  if(new.id==1) stop("No Evolution - Perhaps root sequence is not realistic?")
  row.names(Sequence.Results$Seq)= makeLabel(rep("seq",max(Sequence.Results$Seq.ID)),make.unique = TRUE)
  
  ## turn parent.id into a list to put into the result for the purposes of being able to figure out while tips are extant
  Parent.ID <- list(parent.id=parent.id)
  res <- append(Results,Sequence.Results)
  res <- append(res,Parent.ID)
  
  ## print times
  close(pb)
  print(paste(N,"individuals analysed, concerning a",length(root),"long sequence within",((proc.time() - ptm)[3]),"secs"))
  
  return(res)
}