knitr::opts_chunk$set(echo = TRUE)
library(fitsir)
library(bbmle) ## needed for coef() ...
library(splines) ## for ns()
library(plyr)  ## for raply()
library(dplyr)
library(tibble)
library(tidyr)
library(ggplot2); theme_set(theme_bw())
library(gridExtra)
library(ggstance) ## for horizontal violins
source("stochsim_funs.R")
set.seed(101)
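## 'size' is (presumably) the negative-binomial dispersion of the simulated
## observation noise: smaller values give noisier counts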
s0 <- simfun(rpars=list(size=10))

Various ways of fitting ...

## get starting values and trajectory based on them
## ss0 <- startfun(auto=TRUE,data=s0)
## use the generic (bad!) starting values
ss0 <- startfun()
ss2 <- SIR.detsim(s0$tvec,unlist(trans.pars(ss0)))

## fit and corresponding trajectory
t1 <- system.time(f1 <- fitsir(s0,start=ss0))
ss3 <- SIR.detsim(s0$tvec,trans.pars(coef(f1)))

## GAM fit: match number of degrees of freedom
## glm() reports df as df+2, counting the intercept and the residual variance(?)
m1 <- glm(count~ns(tvec,df=3),
          family=gaussian(link="log"),data=s0)
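
As a quick check of that df accounting (relying on logLik() for a Gaussian glm counting one extra parameter for the residual variance):

length(coef(m1))        ## 4 = intercept + 3 spline basis terms
attr(logLik(m1), "df")  ## 5 = 4 coefficients + 1 residual variance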

fitsir works OK in this case even though we use the generic starting parameters (not auto-fit, which is currently broken):

par(las=1,bty="l")
plot(count~tvec,data=s0)
lines(s0$tvec,ss2)  ## incidence/prevalence mismatch?
lines(s0$tvec,ss3,col=2)  ## decent fit anyway
lines(s0$tvec,predict(m1,type="response"),col=4)
legend("topright",
       c("start","fitsir","spline"),
       col=c(1,2,4),lty=1)

Preliminary ensemble results

## takes about 1 minute per sim ...
system.time(print(fitfun(s0)))

(autostart was used with these cached results ...)

simfn <- "stochsim.rda"
if (file.exists(simfn)) {
   load(simfn)
} else {
set.seed(101)
res1 <- raply(20,fitfun(simfun(rpars=list(size=10))),
              .progress="text")
}

How well does this work?

par(las=1,bty="l")
with(res1,hist(nll.SIR-nll.gam,col="gray",breaks=20,
               main=""))

Notes/preliminary conclusions

fitfun(s0)[c("nll.SIR","nll.gam")]
fitfun2(s0,plot.it=TRUE,log="y")

Note that in this case fitsir has lower mean-squared-error, despite having a higher negative log-likelihood. This makes some sense because fitsir appears better on the log scale ...
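
To see this directly we can compare the two fits on both scales. A minimal sketch, reusing ss3 and m1 from the first example above; mse() is a local helper (not part of fitsir), and log1p() guards against zero counts:

mse <- function(obs,pred) mean((obs-pred)^2)  ## local helper
gampred <- predict(m1,type="response")
c(lin.SIR=mse(s0$count,ss3), lin.gam=mse(s0$count,gampred),
  log.SIR=mse(log1p(s0$count),log1p(ss3)),
  log.gam=mse(log1p(s0$count),log1p(gampred)))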

sizevec <- 10^seq(0,2,length.out=20)
repvals <- 1:10
sres <- data.frame(expand.grid(size=sizevec,rep=repvals),
                   cbind(fitsir=NA,spline=NA))
set.seed(101)
for (i in 1:nrow(sres)) {
    sres[i,3:4] <- 
        fitfun2(simfun(rpars=list(size=sres[i,"size"])))
}
sres2 <- gather(sres,method,mse,-size,-rep)
ggplot(sres2,aes(size,mse,colour=method))+
    scale_x_log10()+
    scale_y_log10()+
    geom_point()+geom_smooth()+scale_colour_brewer(palette="Set1")

We may need pretty small size values to match the observed range of mean-squared-error values ...

A first crack at comparing fitsir and spline fits. There are three ways to do this: (1) true starting values; (2) generic starting values; (3) a Latin hypercube sample (LHS) of starting values, picking the best fit (sketched below).
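
A minimal sketch of option (3), assuming startfun() returns a named numeric vector on the scale that fitsir's start argument expects; the lhs package (not used elsewhere here) supplies the hypercube, and the +/- 1 jitter range is an arbitrary choice:

lhsfit <- function(data, n=10, range=1) {
    p0 <- unlist(startfun())
    u <- lhs::randomLHS(n, length(p0))       ## n points in the unit hypercube
    fits <- lapply(seq_len(n), function(i) {
        start <- p0 + range*(2*u[i,]-1)      ## jitter each parameter by +/- range
        try(fitsir(data, start=start), silent=TRUE)
    })
    fits <- fits[!sapply(fits, inherits, "try-error")]
    nll <- sapply(fits, function(f) -c(logLik(f)))
    fits[[which.min(nll)]]                   ## best of the fits that converged
}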

simfn2 <- "stochsim2.rda"
fitfun.optim <- function(data) {
    t1 <- system.time(f1 <- fitsir.optim(data,start=startfun(auto=TRUE,data=data)))
    m1 <- glm(count~ns(tvec,df=3),family=gaussian(link="log"),data=data)
    res <- c(t=unname(t1["elapsed"]),coef(f1),
      nll.SIR=c(-logLik(f1)),nll.gam=c(-logLik(m1)))
    return(res)
}

if (file.exists(simfn2)) {
   load(simfn2)
} else {
    set.seed(101)
    res1 <- raply(20,fitfun.optim(simfun(rpars=list(size=10))))  ## use the optim-based fit defined above
    save("res1",file=simfn2)
}

Ranges

from Dora:

Nquant <- setNames(
    c(9.908985e+00,4.492488e+02,2.096015e+03,2.878854e+04,2.865343e+31),
    seq(0,100,by=25))
I0quant <-  setNames(
    c(6.573213e-193,6.255714e-04,9.216386e-03,4.933392e-02,9.990980e-01),
    seq(0,100,by=25))

m0 <- matrix(c(1.16,1.66,3.85,3.98,7.68,15.08,0.16,0.36,0.57,0.07,0.13,0.25,
         1.12,1.34,2.11,3.61,6.85,11.07,0.15,0.28,0.55,0.09,0.14,0.28,
         1.26,2.27,5.13,3.90,7.19,11.49,0.26,0.47,1.08,0.09,0.14,0.26,
         1.24,1.88,8.60,4.71,12.97,42.06,0.16,0.31,0.73,0.02,0.08,0.21),
         ncol=4)
dimnames(m0) <- list(paste(rep(c("R0","1/gamma","beta","gamma"),each=3),
                     rep(paste0("Q",1:3),4),sep="_"),
                     c("GB","BR","FI","ID"))
m1 <- m0 %>% as.data.frame %>% rownames_to_column("var") %>%
    separate(var,c("var","quantile"),sep="_") %>%
    gather(country,value,-c(var,quantile))
## mutate(qq=as.numeric(gsub("Q","",quantile))*0.25)
ggplot(m1,aes(country,value))+geom_point()+
    facet_wrap(~var,scales="free")

Comparing smooth.spline and lm(y~ns())

How do we fit a reasonable spline model of equivalent complexity to fitsir (with 4 total parameters)?

?ns says

One can supply ‘df’ rather than knots; ‘ns()’ then chooses ‘df - 1 - intercept’ knots ...

However, this df refers to the number of spline degrees of freedom; if we set df=3 we get 4 parameters (intercept + 3 basis coefficients). smooth.spline with $k$ knots gives $k+2$ coefficients (but it's almost impossible to make it work with $k=2$ knots ...)

set.seed(101)
dx <- data.frame(x=rnorm(100),y=rnorm(100))
length(coef(lm(y~ns(x,df=3),data=dx)))
ss <- with(dx,smooth.spline(x,y,nknots=4))
length(ss$fit$coef)
bombay2 <- setNames(bombay,c("tvec","count"))
f1 <- fitsir(bombay2,start=startfun(auto=TRUE,data=bombay2))
fpred1 <- SIR.detsim(bombay2$tvec,unlist(trans.pars(coef(f1))))
res <- ldply(setNames(6:3,paste0("ns",6:3)),
      function(x) {
    m <- lm(log(count)~ns(tvec,df=x),data=bombay2)
    data.frame(tvec=bombay2$tvec,var="log",pred=exp(predict(m)))
    },.id="method")
res2 <- ldply(setNames(6:3,paste0("ns",6:3)),
      function(x) {
    m <- glm(count~ns(tvec,df=x),
             family=gaussian(link="log"),data=bombay2)
    data.frame(tvec=bombay2$tvec,var="lin",pred=exp(predict(m)))
    },.id="method")
## ss$fit$nk ## number of knots
res3 <- rbind(data.frame(method="fitsir",
                         var="lin",tvec=bombay2$tvec,pred=fpred1),
             res,res2)
ggspline <- ggplot(res3,aes(tvec,pred))+
    geom_line(aes(colour=method,lty=var))+
    geom_point(data=bombay2,aes(y=count))
grid.arrange(ggspline,ggspline+scale_y_log10(),nrow=1)

Stochastic simulation batch results

load("stochsim_4.rda")  ## main results
res2 <- res %>% as.data.frame() %>% select(contains("mse")) %>% na.omit()
nsims <- length(na.omit(res[,"fitsir.1_time"]))

The order of the MSE columns in the sim output is given by labs below: fitsir with auto-selected starts, fitsir with true starts, smooth.spline, and ns() fits with df=4 and df=6 on the linear and log scales.

It turns out that the fits with the linear-scale criterion have terrible MSEs, so we leave them out for now.

labs <- c("fitsir_auto","fitsir_true",
          "smooth.spline","ns_4_lin","ns_4_log",
          "ns_6_lin","ns_6_log")
res3 <- setNames(res2,labs) %>% select(-contains("_lin"))
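## scale each row (one simulation) by its minimum: MSE relative to the best method for that sim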
res3_diff <- as.data.frame(t(apply(res3,1,function(x) x/min(x))))
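## horizontal analogue of ggplot2::median_hilow (x/xmin/xmax instead of y/ymin/ymax);
## defined for pointrangeh-style summaries, though the quantile summary below is what gets plotted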
median_hilow_h <- function (x, ...)  {
    result <- do.call(Hmisc::smedian.hilow, list(x = quote(x), ...))
    plyr::rename(data.frame(t(result)), c(Median = "x", Mean = "x", 
        Lower = "xmin", Upper = "xmax"), warn_missing = FALSE)
}
res3g <- gather(res3,method,mse)
gsum <- res3g %>% group_by(method) %>%
    mutate(lmse = log10(sqrt(mse))) %>%
    summarise(median=median(lmse),min=quantile(lmse,0.025),
              max=quantile(lmse,0.975))
gghist <- ggplot(gather(res3,method,mse),aes(x=log10(mse),fill=method))+
    geom_histogram(alpha=0.4,position="identity",bins=30)
ggdens <- ggplot(gather(res3,method,mse),aes(x=log10(mse),fill=method))+
    geom_density(alpha=0.4,position="identity")
ggviolin <- ggplot(res3g,aes(x=log10(sqrt(mse)),y=method,fill=method))+
    geom_violinh()+
    scale_fill_brewer(palette="Set1")+
    theme(legend.position="none")+
    geom_pointrangeh(data=gsum,aes(x=median,xmin=min,xmax=max))
ggdensdiff <- ggdens %+% gather(res3_diff,method,mse)
## grid.arrange(ggdens,ggdensdiff,nrow=1)
print(ggviolin+ggtitle(sprintf("stoch sim comparison (n=%d)",nsims)))

(points/bars are median, 95% quantiles)

Conclusions:

res3g2 <- res3g %>% filter(method %in% c("fitsir_auto","ns_4_log"))
gsum2 <- res3g2 %>% group_by(method) %>%
    mutate(lmse = log10(sqrt(mse))) %>%
    summarise(median=median(lmse),min=quantile(lmse,0.025),
              max=quantile(lmse,0.975))
ggviolin2 <- ggplot(res3g2,aes(x=log10(sqrt(mse)),y=method,fill=method))+
    geom_violinh()+
    scale_fill_brewer(palette="Set1")+
    theme(legend.position="none")+
    geom_pointrangeh(data=gsum2,aes(x=median,xmin=min,xmax=max))
ggsave(ggviolin2,file="ggviolin2.png")


