R/network-tracer-accumlation.R

#' Network_Pull_Tracer
#' 
#' Summary Statistics are clock rate, growthRate, ePop and TMRCA. 
#' 
#' This slightly brute force function searches for all directories under a parent directory, 
#' assuming they were created with BEAST_R0_Check or similar script, (see script for possible 
#' differences in directories and naming conventions for tracer exports) and collates the necessary summary statistics into one table.
#' 
#'
#'
#' @param reps replicates total. Default = 10
#' @param dirs number of subdirectories to be scanned
#' @param vars variables total. Default = 5
#' @param var.R0 boolean for whether Generation_Sim or Broken was used in data creation. Default=TRUE 
#' @param N.v column vector of N. Default: N.v=c(rep(10000,reps)),
#' @param mu.v column vector of mutation rates in directory. Default: mu.v=c(1e-04, rep(0.001,3), rep(1e-04,3), rep(1e-05,3)),
#' @param Tg.v column vector of generation times in directory. Default: Tg.v=c(1,rep(3,9)),
#' @param R0.v column vector of R0 in directory. Default: R0.v=c(2,rep(c(2,5,20),3)),
#' @param root.dir - root directory, omitting a final /

#' @return Returns a data frame of collated statstics 
#' 
#' 
#' @export
#' 
#'

Network_Pull_Tracer <- function(reps=1,
                                dirs=seq(from=1,to=19.5,by=0.5),
                                vars=5,
                                var.R0=TRUE,
                                N.v=c(rep(10000,length(dirs))),
                                mu.v=c(rep(1e-04,length(dirs))),
                                Tg.v=c(rep(50,length(dirs))),
                                R0.v=c(rep(2.4,length(dirs))),
                                network.root.dir="LSD4\\mu_1e-04_R0_2.4_Tg_50\\",
                                local.root.dir="LSD4/mu_1e-04_R0_2.4_Tg_50/"){
  
  ## WARNINGS ##  
  
  ## AUXILLARY FUNCTIONS ##
  nex_read <- function(nex.file, case.tree){
    
    # pull nexus file and isolate the names for meta data
    nex <- read.nexus.data(nex.file)
    nex.names <- names(nex)
    split.names <- strsplit(nex.names,split = "_")
    
    # set up results to store from each split
    Seq.Res <- vector(mode="character",length=length(split.names))
    Time.Res <- vector(mode="character",length=length(split.names))
    
    for (i in 1:length(split.names)){
      
      Seq.Res[i] <- split.names[[i]][2]
      Time.Res[i] <- split.names[[i]][3]
      
    }
    
    # calculate needed statistics before returning
    Sample.Size <- length(Seq.Res)
    Sequence.Count <- length(unique(Seq.Res))
    Time.Span <- max(as.numeric(Time.Res))-min(as.numeric(Time.Res))
    Youngest.Sample <- max(case.tree$Time)-max(as.numeric(Time.Res))
    Oldest.Sample <- min(as.numeric(Time.Res))
    Percent.Cover <- Time.Span/max(case.tree$Time)
    
    res <- c(Sample.Size,Sequence.Count,Time.Span, Youngest.Sample,Oldest.Sample, Percent.Cover)
    return(res)
  }
  
  RMSE_calculation <- function(Full.Table){
    
    l <- rep(0,dim(Full.Table)[1])
    Errors <- list("RMedSE_mu"=l,"RMedSE_Gr"=l,"RMedSE_ePop"=l,"RMedSE_TMRCA"=l,"RMeanSE_mu"=l,"RMeanSE_Gr"=l,"RMeanSE_ePop"=l,"RMeanSE_TMRCA"=l)
    
    Errors$RMedSE_mu <- sqrt((Full.Table$O_Mu_Med - Full.Table$E_Mu)^2)
    Errors$RMedSE_Gr <- sqrt((Full.Table$O_Gr_Med - Full.Table$E_Gr)^2)
    Errors$RMedSE_ePop <- sqrt((Full.Table$O_ePop_Med - Full.Table$E_ePop)^2)
    Errors$RMedSE_TMRCA <- sqrt((Full.Table$O_TMRCA_Med - Full.Table$E_TMRCA)^2)
    Errors$RMeanSE_mu <- sqrt((Full.Table$O_Mu_Mean - Full.Table$E_Mu)^2)
    Errors$RMeanSE_Gr <- sqrt((Full.Table$O_Gr_Mean - Full.Table$E_Gr)^2)
    Errors$RMeanSE_ePop <- sqrt((Full.Table$O_ePop_Mean - Full.Table$E_ePop)^2)
    Errors$RMeanSE_TMRCA <- sqrt((Full.Table$O_TMRCA_Mean - Full.Table$E_TMRCA)^2)
    res <- cbind(Full.Table,data.frame(Errors))
    
    return(res)
  }
  
  ## Handle network path
  
  network.dir <- paste("\\\\fi--san02.dide.ic.ac.uk\\homes\\olw13\\Data\\",network.root.dir,sep="")
  root.dir <- paste("C:/Users/Oliver/Google Drive/Academic Work/Imperial/O15-12/BEAST/Data Collection/",local.root.dir,sep="")
  
  for (i in 1:length(dirs)){
    
    ## create file and directory locations from the given root.dir
    case.file <- paste(root.dir,"/casetree.RData",sep="")
    summary.file <- paste(root.dir,"sigma",dirs[i],"/summary.txt",sep="")
    nex.dir <- paste(root.dir,"sigma",dirs[i],"/",sep="")
    load(case.file)
    
    ## load the summary statistic file exported from TRACER
    if(file.exists(paste(nex.dir,"summary.txt",sep=""))==FALSE){
      Tracer_Addon(file.dir=nex.dir,log.dir=paste(network.dir,"sigma",dirs[i],"\\",sep=""),reps=reps,replicate.name = paste("sigma",dirs[i],sep=""))
    }
    all <- read.delim(summary.file, row.names=1,stringsAsFactors = FALSE)
    
    ## create results
    Result <- list("Var"=rep(var.R0,reps),"Rep"=1:reps,"Tg"=rep(Tg.v[i],reps),"E_Mu"=rep(mu.v[i],reps), "O_Mu_Med"=as.numeric(all["median",seq(3,vars*(reps-1)+3,vars)]),
                   "O_Mu_Mean"=as.numeric(all["mean",seq(3,vars*(reps-1)+3,vars)]),"O_Mu_StdErr"=as.numeric(all["stdErr",seq(3,vars*(reps-1)+3,vars)]),
                   "ESS_mu"=as.numeric(all["effective sample size (ESS)",seq(3,vars*(reps-1)+3,vars)]),
                   "O_Mu_95L"=as.numeric(all["95% HPD Interval lower",seq(3,vars*(reps-1)+3,vars)]),
                   "O_Mu_95H"=as.numeric(all["95% HPD Interval higher",seq(3,vars*(reps-1)+3,vars)]),
                   "E_Gr"=rep((log(R0.v[i])/Tg.v[i]),reps), "O_Gr_Med"=as.numeric(all["median",seq(5,vars*(reps-1)+5,vars)]),
                   "O_Gr_Mean"=as.numeric(all["mean",seq(5,vars*(reps-1)+5,vars)]),"O_Gr_StdErr"=as.numeric(all["stdErr",seq(5,vars*(reps-1)+5,vars)]),
                   "ESS_Gr"=as.numeric(all["effective sample size (ESS)",seq(5,vars*(reps-1)+5,vars)]),
                   "O_Gr_95L"=as.numeric(all["95% HPD Interval lower",seq(5,vars*(reps-1)+5,vars)]),
                   "O_Gr_95H"=as.numeric(all["95% HPD Interval higher",seq(5,vars*(reps-1)+5,vars)]),
                   "E_ePop"=rep(N.v[i],reps), "O_ePop_Med"=as.numeric(all["median",seq(4,vars*(reps-1)+4,vars)]),
                   "O_ePop_Mean"=as.numeric(all["mean",seq(4,vars*(reps-1)+4,vars)]),"O_ePop_StdErr"=as.numeric(all["stdErr",seq(4,vars*(reps-1)+4,vars)]),
                   "ESS_ePop"=as.numeric(all["effective sample size (ESS)",seq(4,vars*(reps-1)+4,vars)]),
                   "O_ePop_95L"=as.numeric(all["95% HPD Interval lower",seq(4,vars*(reps-1)+4,vars)]),
                   "O_ePop_95H"=as.numeric(all["95% HPD Interval higher",seq(4,vars*(reps-1)+4,vars)]),
                   "E_TMRCA"=rep(max(case.tree$Time),reps), "O_TMRCA_Med"=as.numeric(all["median",seq(2,vars*(reps-1)+2,vars)]),
                   "O_TMRCA_Mean"=as.numeric(all["mean",seq(2,vars*(reps-1)+2,vars)]),"O_TMRCA_StdErr"=as.numeric(all["stdErr",seq(2,vars*(reps-1)+2,vars)]),
                   "ESS_TMRCA"=as.numeric(all["effective sample size (ESS)",seq(2,vars*(reps-1)+2,vars)]),
                   "O_TMRCA_95L"=as.numeric(all["95% HPD Interval lower",seq(2,vars*(reps-1)+2,vars)]),
                   "O_TMRCA_95H"=as.numeric(all["95% HPD Interval higher",seq(2,vars*(reps-1)+2,vars)]))
                   
    
    ## collate results before returning
    if(i==1){
      Full.Table <- data.frame(Result)
    } else {
      Full.Table <- rbind(Full.Table, data.frame(Result))
    }
    
  }
  res <- RMSE_calculation(Full.Table = Full.Table)
  return(res)
}
OJWatson/sims4 documentation built on May 7, 2019, 8:33 p.m.