R/all_functions.R

Defines functions runTBclustr modelstats plotclusterdist fitLogNormalmodel bootstrapmodel drawfitfigure examineposteriors

runTBclustr = function(filename){
  rc=read.csv(filename)
  
  plotclusterdist(rc) # plot the histogram of clustersizes for your data
  
  ABC_output<-fitLogNormalmodel(rc)
  
  mod1<-bootstrapmodel(ABC_output,rc)
  
  examineposteriors(ABC_output)
}



modelstats = function(pars)
{
  # this function calls the branching process function 
  # (rcpp_clusterfunction) with the correct parameters (stored in pars) 
  # and generates a vector of cluster sizes, where each entry 
  # represents the size of a cluster generated by the branching process. 
  
  mu=pars[1]
  sigma=pars[2]
  overlap=pars[3]
  numberclusters=pars[4]
  maxclustersize=pars[5] #max(rc$x=204)
  mylseq=unique(floor(exp(seq(log(1),log(maxclustersize),length.out=50))))
  counts=rep(0,length(mylseq))
  
  rzero = exp(mu + 0.5*sigma^2)
  if (rzero>=1){return(counts)}
  
  clustersout=rep(0,numberclusters)
  rcpp_clusterfunction(numberclusters,mu,sigma,overlap,clustersout)
  
  #no longer using c1$clustersizes
  if(sum(is.na(clustersout))>0){return(counts)}
  if(sum(is.infinite(clustersout))>0){return(counts)}
  
  #log bin cluster sizes 
  
  modelclusters = clustersout
  
  for(i in 1:(length(mylseq)-1))
  {
    counts[i] = sum(modelclusters>=mylseq[i] & modelclusters<mylseq[i+1])
  }				
  
  return(counts)
}

#test:
# modelstats(c(-2,0.1,0.1,length(rc$x),300))


plotclusterdist = function(rc){
  
  require('dplyr')
  require('ggplot2')
  
  rc%>%ggplot() + 
    geom_point(aes(x=x, y=..count..), stat="bin",bins=20,shape=15,colour = "black", fill = "white", size = 5, stroke = 0.1)+
    scale_x_log10()+
    scale_y_log10()+
    xlab('Size of cluster (s)')+
    ylab('Number of clusters of size s')
}


fitLogNormalmodel = function(rc){
  # this function takes the data (as a data frame) containing a single list of cluster sizes
  # and fits the model specified in modelstats above to the data using EasyABC
  
  #set the list of parameter priors, required by EasyABC
  
  require("EasyABC") # make sure that EasyABC is installed and loaded
  
  set.seed(5) # set random number generator seed - needed??
  
  # find the maximum of the cluster sizes and set maxclustersize to something bigger (for binning)
  
  maxclustersize=floor(max(rc$x)*1.5)#this is somewhat arbitrary
  
  #log bin data - the output datacounts is required for fitting to.
  mylseq=unique(floor(exp(seq(log(1),log(maxclustersize),length.out=50))))
  datacounts=rep(0,length(mylseq))
  for(i in 1:(length(mylseq)-1))
  {
    datacounts[i] = sum(rc$x>=mylseq[i] & rc$x<mylseq[i+1])
  }  	
  # plot(mylseq,datacounts,log="xy",xlab="Cluster size",ylab="Frequency")
  
  #define priors (last two variables are fixed)
  priors=list(c("unif",-10,0),c("unif",0,10),c("unif",0,0.5),c("unif",length(rc),length(rc)),c("unif",300,300))
  
  #set criteria for parameters (rzero<1)
  prior_test=c("X1<=0","X2>0","X3>0","X3<1","(X2^2)<(-2*X1)")
  
  #test ABC code with the rejection algorithm - this runs quickly to check it works, for debugging code and data.
  p=0.1
  n=10000
  ABC_rej<-ABC_rejection(model=modelstats, prior=priors, nb_simul=n, summary_stat_target=datacounts, tol=p,prior_test=prior_test)
  
  #now run for real with Marjoram MCMC algorithm (may take several hours on a slower machine)
  ABC_plognorm15<-ABC_mcmc(method="Marjoram", model=modelstats,progress_bar=T, 
                           prior=priors, summary_stat_target=datacounts, prior_test=prior_test,n_rec=100000)
  
  #ABC_plognorm15 = ABC_rej
  save(ABC_plognorm15,file="MCMCplognormfit.RData") # save output of ABC runs to file
  return(ABC_plognorm15)
}


bootstrapmodel = function(ABC_plognorm15,rc){
  maxclustersize=floor(max(rc$x)*1.5)#this is somewhat arbitrary
  mylseq=unique(floor(exp(seq(log(1),log(maxclustersize),length.out=50))))
  # datacounts=rep(0,length(mylseq))
  # for(i in 1:(length(mylseq)-1))
  # {
  #   datacounts[i] = sum(rc$x>=mylseq[i] & rc$x<mylseq[i+1])
  # }  	
  # load(file="MCMCplognormfit.RData")
  
  
  N1=length(ABC_plognorm15$param[,1])
  myboot = 1000
  mod1=matrix(0,ncol=myboot,nrow=length(mylseq))
  clustersout = rep(0,length(rc$x))
  for(i in 1:myboot)
  {
    myrand=floor(runif(1,1,N1+1))
    rcpp_clusterfunction(length(rc$x),ABC_plognorm15$param[myrand,1],ABC_plognorm15$param[myrand,2],ABC_plognorm15$param[myrand,3],clustersout)
    
    #log bin model output as data
    modelcounts=rep(0,length(mylseq))
    for(j in 1:(length(mylseq)-1)){modelcounts[j] = sum(clustersout>=mylseq[j] & clustersout<mylseq[j+1])}	
    
    mod1[,i] = modelcounts
  }
  return(mod1)
}

drawfitfigure = function(rc,mod1){
  maxclustersize=floor(max(rc$x)*1.5)#this is somewhat arbitrary
  mylseq=unique(floor(exp(seq(log(1),log(maxclustersize),length.out=50))))
  datacounts=rep(0,length(mylseq))
  for(i in 1:(length(mylseq)-1))
  {
    datacounts[i] = sum(rc$x>=mylseq[i] & rc$x<mylseq[i+1])
  }  	
  #draw results figure
  #pdf("figs/plognormmodelfit2_15.pdf")
  par(mar=c(5,5,1,1))
  matplot(mylseq,mod1,pch=19,log="xy",col=rgb(1,1,0,0.1),xlab="Cluster size",ylab="Frequency",cex.lab=1.7,cex.axis=1.5,cex=1)
  points(mylseq,datacounts,pch=22,col="red",cex=1.5,bg="red")
  legend('topright',c("Data","Model"),pch=c(22,19),col=c("red",rgb(1,1,0)),cex=1.5,inset=0.01,pt.bg="red",pt.cex=c(2.5,2.5))
  #dev.off()
}

examineposteriors = function(ABC_plognorm15){
  #now examine posteriors
  p1=ABC_plognorm15$param[,1]
  p2=ABC_plognorm15$param[,2]
  p3=ABC_plognorm15$param[,3]
  
  
  # calculate R0 and confidence intervals
  rzero = exp(p1 + 0.5*p2^2)
  rmed=sort(rzero)[floor(length(p1)*(0.5))]
  rlo=sort(rzero)[floor(length(p1)*(0.025))]
  rhi=sort(rzero)[floor(length(p1)*(0.975))]
  mean(rzero)
  print(paste("Median R0 =",round(rmed,3),"95%CI = (",round(rlo,3),",",round(rhi,3),")"))
  #pdf("figs/plognormrzero15.pdf")
  par(mar=c(5,5,1,1))
  hist(rzero,col="purple",xlab="Basic reproduction number",main="")
  box()
  #dev.off()
  
  uktransmitted = rzero/(1+rzero+p3)
  ukmed=sort(uktransmitted)[floor(length(p1)*(0.5))]
  uklo=sort(uktransmitted)[floor(length(p1)*(0.025))]
  ukhi=sort(uktransmitted)[floor(length(p1)*(0.975))]
  mean(uktransmitted)
  print(paste("Median % recent transmission =",round(100*ukmed,1),"% 95%CI = (",round(100*uklo,1),",",round(100*ukhi,1),")"))
  
  
  #pdf("figs/plognorm_imported15.pdf")
  par(mar=c(5,5,1,1))
  hist(1-uktransmitted,col=rgb(0,0.5,0.7),xlab="Fraction of cases imported",main="")
  box()
  #dev.off()
}  
ldanon/TBclustr documentation built on May 14, 2019, 10:35 a.m.