knitr::opts_chunk$set(echo = TRUE)
library(fitsir) library(bbmle) ## needed for coef() ... library(splines) ## for ns() library(plyr) ## for raply() library(dplyr) library(tibble) library(tidyr) library(ggplot2); theme_set(theme_bw()) library(gridExtra) library(ggstance) ## for horizontal violins
source("stochsim_funs.R")
set.seed(101) s0 <- simfun(rpars=list(size=10))
Various ways of fitting ...
## get starting values and trajectory based on them ## ss0 <- startfun(auto=TRUE,data=s0) ## use the generic (bad!) starting values ss0 <- startfun() ss2 <- SIR.detsim(s0$tvec,unlist(trans.pars(ss0))) ## fit and corresponding trajectory t1 <- system.time(f1 <- fitsir(s0,start=ss0)) ss3 <- SIR.detsim(s0$tvec,trans.pars(coef(f1))) ## GAM fit: match number of degrees of freedom ## glm() lists df as df+2 ## counting for intercept and residual var(?) m1 <- glm(count~ns(tvec,df=3), family=gaussian(link="log"),data=s0)
fitsir
works OK in this case even though we use the
generic starting parameters (not auto-fit, which is currently
broken):
par(las=1,bty="l") plot(count~tvec,data=s0) lines(s0$tvec,ss2) ## incidence/prevalence mismatch? lines(s0$tvec,ss3,col=2) ## decent fit anyway lines(s0$tvec,predict(m1,type="response"),col=4) legend("topright", c("start","fitsir","spline"), col=c(1,2,4),lty=1)
## takes about 1 minute per sim ... system.time(print(fitfun(s0)))
(autostart was used with these cached results ...)
simfn <- "stochsim.rda" if (file.exists(simfn)) { load(simfn) } else { set.seed(101) res1 <- raply(20,fitfun(simfun(rpars=list(size=10))), .progress="text") }
How well does this work?
par(las=1,bty="l") with(res1,hist(nll.SIR-nll.gam,col="gray",breaks=20, main=""))
fitsir
mostly works OK, doesn't need Latin hypercube for this examplefitfun(s0)[c("nll.SIR","nll.gam")] fitfun2(s0,plot.it=TRUE,log="y")
Note that in this case fitsir
has lower mean-squared-error, despite having a higher negative log-likelihood. This makes some sense because fitsir
appears better on the log scale ...
sizevec <- 10^seq(0,2,length.out=20) repvals <- 1:10 sres <- data.frame(expand.grid(size=sizevec,rep=repvals), cbind(fitsir=NA,spline=NA)) set.seed(101) for (i in 1:nrow(sres)) { sres[i,3:4] <- fitfun2(simfun(rpars=list(size=sres[i,"size"]))) } sres2 <- gather(sres,method,mse,-size,-rep) ggplot(sres2,aes(size,mse,colour=method))+ scale_x_log10()+ scale_y_log10()+ geom_point()+geom_smooth()+scale_colour_brewer(palette="Set1")
We may need pretty small size
values to get match the observed range of mean-squared error values ...
A first crack at comparing fitsir
and spline (three ways to do this -- (1) true start values; (2) generic start values; (3) LHS of starting values, pick best)
simfn2 <- "stochsim2.rda" fitfun.optim <- function(data) { t1 <- system.time(f1 <- fitsir.optim(data,start=startfun(auto=TRUE,data=data))) m1 <- glm(count~ns(tvec,df=3),family=gaussian(link="log"),data=data) res <- c(t=unname(t1["elapsed"]),coef(f1), nll.SIR=c(-logLik(f1)),nll.gam=c(-logLik(m1))) return(res) } if (file.exists(simfn2)) { load(simfn2) } else { set.seed(101) res1 <- raply(20,fitfun(simfun(rpars=list(size=10)))) save("res1",file=simfn2) }
from Dora:
Nquant <- setNames( c(9.908985e+00,4.492488e+02,2.096015e+03,2.878854e+04,2.865343e+31), seq(0,100,by=25)) I0quant <- setNames( c(6.573213e-193,6.255714e-04,9.216386e-03,4.933392e-02,9.990980e-01), seq(0,100,by=25)) m0 <- matrix(c(1.16,1.66,3.85,3.98,7.68,15.08,0.16,0.36,0.57,0.07,0.13,0.25, 1.12,1.34,2.11,3.61,6.85,11.07,0.15,0.28,0.55,0.09,0.14,0.28, 1.26,2.27,5.13,3.90,7.19,11.49,0.26,0.47,1.08,0.09,0.14,0.26, 1.24,1.88,8.60,4.71,12.97,42.06,0.16,0.31,0.73,0.02,0.08,0.21), ncol=4) dimnames(m0) <- list(paste(rep(c("R0","1/gamma","beta","gamma"),each=3), rep(paste0("Q",1:3),4),sep="_"), c("GB","BR","FI","ID")) m1 <- m0 %>% as.data.frame %>% rownames_to_column("var") %>% separate(var,c("var","quantile"),sep="_") %>% gather(country,value,-c(var,quantile)) ## mutate(qq=as.numeric(gsub("Q","",quantile))*0.25) ggplot(m1,aes(country,value))+geom_point()+ facet_wrap(~var,scale="free")
dt
)lm(y~ns())
How do we fit a reasonable spline model of equivalent complexity to fitsir
(with 4 total parameters)?
?ns
says
One can supply ‘df’ rather than knots; ‘ns()’ then chooses ‘df - 1 - intercept’ knots ...
however, the df
refers to the number of spline degrees of freedom; if we set df=3
we get 4 parameters (intercept+3). smooth.spline
with $k$ knots gives $k+2$ coefficients (but it's almost impossible to make it work with $k=2$ knots \ldots)
set.seed(101) dx <- data.frame(x=rnorm(100),y=rnorm(100)) length(coef(lm(y~ns(x,df=3),data=dx))) ss <- with(dx,smooth.spline(x,y,nknots=4)) length(ss$fit$coef)
df=4
/nknots=2
: ?splines::ns
says "". However, nknots=2
seems to break smooth.spline()
, which does some fancy internal calculations to compute the smoothing parameter (see ?smooth.spline
...)bombay2 <- setNames(bombay,c("tvec","count")) f1 <- fitsir(bombay2,start=startfun(auto=TRUE,data=bombay2)) fpred1 <- SIR.detsim(bombay2$tvec,unlist(trans.pars(coef(f1)))) res <- ldply(setNames(6:3,paste0("ns",6:3)), function(x) { m <- lm(log(count)~ns(tvec,df=x),data=bombay2) data.frame(tvec=bombay2$tvec,var="log",pred=exp(predict(m))) },.id="method") res2 <- ldply(setNames(6:3,paste0("ns",6:3)), function(x) { m <- glm(count~ns(tvec,df=x), family=gaussian(link="log"),data=bombay2) data.frame(tvec=bombay2$tvec,var="lin",pred=exp(predict(m))) },.id="method") ## ss$fit$nk ## number of knots res3 <- rbind(data.frame(method="fitsir", var="lin",tvec=bombay2$tvec,pred=fpred1), res,res2) ggspline <- ggplot(res3,aes(tvec,pred))+ geom_line(aes(colour=method,lty=var))+ geom_point(data=bombay2,aes(y=count)) grid.arrange(ggspline,ggspline+scale_y_log10(),nrow=1)
load("stochsim_4.rda") ## main results res2 <- res %>% data.frame(as.is=TRUE) %>% select(contains("mse")) %>% na.omit() nsims <- length(na.omit(res[,"fitsir.1_time"]))
Order of columns in sim output is:
ns()
with 4 coefficients, lin/log scale criterionns()
with 6 coeff, dittoIt turns out the fits with the linear-scale criterion have terrible MSEs, leaving these out for now.
labs <- c("fitsir_auto","fitsir_true", "smooth.spline","ns_4_lin","ns_4_log", "ns_6_lin","ns_6_log") res3 <- setNames(res2,labs) %>% select(-contains("_lin")) res3_diff <- as.data.frame(t(apply(res3,1,function(x) x/min(x)))) median_hilow_h <- function (x, ...) { result <- do.call(Hmisc::smedian.hilow, list(x = quote(x), ...)) plyr::rename(data.frame(t(result)), c(Median = "x", Mean = "x", Lower = "xmin", Upper = "xmax"), warn_missing = FALSE) } res3g <- gather(res3,method,mse) gsum <- res3g %>% group_by(method) %>% mutate(lmse = log10(sqrt(mse))) %>% summarise(median=median(lmse),min=quantile(lmse,0.025), max=quantile(lmse,0.975))
gghist <- ggplot(gather(res3,method,mse),aes(x=log10(mse),fill=method))+ geom_histogram(alpha=0.4,position="identity",bins=30) ggdens <- ggplot(gather(res3,method,mse),aes(x=log10(mse),fill=method))+ geom_density(alpha=0.4,position="identity") ggviolin <- ggplot(res3g,aes(x=log10(sqrt(mse)),y=method,fill=method))+ geom_violinh()+ scale_fill_brewer(palette="Set1")+ theme(legend.position="none")+ geom_pointrangeh(data=gsum,aes(x=median,xmin=min,xmax=max)) ggdensdiff <- ggdens %+% gather(res3_diff,method,mse) ## grid.arrange(ggdens,ggdensdiff,nrow=1) print(ggviolin+ggtitle(sprintf("stoch sim comparison (n=%d)",nsims)))
(points/bars are median, 95% quantiles)
Conclusions:
smooth.spline
with 6 coefficients (= 4 knots), and ns()
with 6 coefficients (spline.df
= 5) give approximately equal MSE distributions, better than the rest (but have more parameters)ns()
with 4 coefficients (spline.df
=3) does worse (median) than fitsir
, but worst-case scenario is better ...fitsir
starting from true parameter values (best-case scenario?) does slightly better than the auto-start, but not huge differences```r res3g2 <- res3g %>% filter(method %in% c("fitsir_auto","ns_4_log")) gsum2 <- res3g2 %>% group_by(method) %>% mutate(lmse = log10(sqrt(mse))) %>% summarise(median=median(lmse),min=quantile(lmse,0.025), max=quantile(lmse,0.975)) ggviolin2 <- ggplot(res3g2,aes(x=log10(sqrt(mse)),y=method,fill=method))+ geom_violinh()+ scale_fill_brewer(palette="Set1")+ theme(legend.position="none")+ geom_pointrangeh(data=gsum2,aes(x=median,xmin=min,xmax=max)) ggsave(ggviolin2,file="ggviolin2.png")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.