knitr::opts_chunk$set( collapse = TRUE, echo=F, comment = "#>", fig.height = 3, fig.width = 5, fig.align = "center", dpi=300, out.width="600px" )
#Install the package #library(devtools) #devtools::install_github('https://github.com/weinbergerlab/InterventionEvaluatR') library(knitr) library(plyr) library(InterventionEvaluatR) library(rjags) library(coda) library(HDInterval) library(lubridate) library(pbapply) library(parallel) library(plyr) library(ggbiplot) library(htmlTable) library(nFactors) library(INLA) library(xtable) library(bsts)
This Rmd file contains analyses of the data from S Africa. It uses a slightly different model (Poisson) than the one described in Kleynhans et al. To see those results, see the synthetic_control_run.R file
In this file, we use ridge regression to weight the control diseases. This is done in the INLA package, which is computationally faster than the spike and slab variable selection that has been used in the past. In many instance, the results are very close between the spike and slab method and the ridge regression method, but there are some instances where differences arise. The two approaches are compared here.
n.time<-120 ar1.gen<-function(rho, prec.set){ y<-rep(NA, times=n.time) y[1]<- 0 for(i in 2:n.time){ y[i]<- y[i-1]*rho + rnorm(1,0, sqrt(1/(prec.set/(1-rho*rho)) )) } y<-y return(y) } set.seed(123) X1<-rpois(n.time, 15) X2<-rpois(n.time, 10) ar.piece<-ar1.gen(rho=0.7, prec.set=10) fixed.piece<- 0.2*log(X1) +0.3*log(X2) ts1<-exp(1+ar.piece + fixed.piece) ts1[108:120]<-ts1[108:120]*0.8 ts1.count<-rpois(length(ts1), ts1) dates<-seq.Date(from=as.Date('2000-01-01'), length.out = n.time, by='month') ds1<-cbind.data.frame('Y'=ts1.count,'rep'=1,'date'=dates, X1, X2, denom=1) ds1$rep<-as.factor(ds1$rep)
Here we need to set a few parameters. We Use the evaluatr.init() function to specify the name of the dataset, the date at which the vaccine is introduced, the date at which we want to begin evaluating the vaccine (typically 1-2 year after vaccine introduction). We also provide some information on the dataset, sch as whether the data are monthly or quarterly (n_seasons), the variable names for the grouping variable, the date variable, the outcome variable, and the denominator variable (if any).
#age.select=unique(sa2$age)[6] #age.select
analysis.ridge.iid <- evaluatr.init( country = "sim", data = ds1, post_period_start = "2008-01-01", eval_period_start = "2009-01-01", eval_period_end = "2009-12-01", n_seasons = 12, #This is monthly data, so select 12 year_def = "cal_year", # we are in southern hemisphere, so aggregate results by calendar year (Jan-Dec) group_name = "rep", #Strata categry name date_name = "date", #Date variable name outcome_name = "Y", #Outcome variable name denom_name = "denom", #Denominator variable name ridge=T, error_dist='iid' ) set.seed(1)
analysis.ridge.ar1 <- evaluatr.init( country = "sim", data = ds1, post_period_start = "2008-01-01", eval_period_start = "2009-01-01", eval_period_end = "2009-12-01", n_seasons = 12, #This is monthly data, so select 12 year_def = "cal_year", # we are in southern hemisphere, so aggregate results by calendar year (Jan-Dec) group_name = "rep", #Strata categry name date_name = "date", #Date variable name outcome_name = "Y", #Outcome variable name denom_name = "denom", #Denominator variable name ridge=T, error_dist='ar1' ) set.seed(1)
#source('inla_functions.R') #results1<-evaluatr.impact(analysis, variants=c('full', 'time', 'time_no_offset') ) ptm <- proc.time() results.ridge.iid<-evaluatr.impact(analysis.ridge.iid, variants=c('full', 'time', 'time_no_offset') ) proc.time() - ptm
#source('inla_functions.R') #results1<-evaluatr.impact(analysis, variants=c('full', 'time', 'time_no_offset') ) ptm <- proc.time() results.ridge.ar1<-evaluatr.impact(analysis.ridge.ar1, variants=c('full', 'time', 'time_no_offset') ) proc.time() - ptm
note:Adding more samples does not affect results at all. What is important is ensuring there are enough PCs in the A matrix when restricting the AR1 random effect. If based on PCs with variance>1, there is still residual bias. Just taking an arbitrary 5 PCs seems to do the trick.
Main results
compare.results<-function(res1){ results.table<- cbind.data.frame( #impact_results$best$rr_mean_intervals, res1$full$rr_mean_intervals, res1$time$rr_mean_intervals, res1$time_no_offset$rr_mean_intervals) #res1$its$rr_mean_intervals ) #impact_results$pca$rr_mean_intervals) table<-xtable(results.table) htmlTable(table) } compare.results(results.ridge.ar1) compare.results(results.ridge.iid)
How many cases were prevented from the time of vaccine introduction to the last time point in each stratum (+/- 95% CrI)? You can modify the number 'last.point' to pull out the cumulative number of cases at any point in the time series. In this case we are printing results fromthe SC model
print.cum<-function(res1){ last.point<-dim(res1$full$cumsum_prevented)[1] cum.prevented<-res1$full$cumsum_prevented[last.point,,] cum1<- round(t(cum.prevented)) cum2<- paste0(cum1[,'50%'], ' (', cum1[,'2.5%'],', ',cum1[,'97.5%'],')') cum3<-cbind.data.frame(row.names(cum1), cum2) names(cum3)<-c('Stratum','Cases Averted (95% CrI)') htmlTable(cum3, align='l')} print.cum(results.ridge.iid) print.cum(results.spike)
plots.ridge <- evaluatr.plots(analysis.ridge.iid) plots.ridge.ar1 <- evaluatr.plots(analysis.ridge.ar1)
This first plot shows a monthly time series, with observed, fitted, and counterfacual values. The observed number of deaths is shown in the black line. The fitted values for the pre-vaccine period are shown in the red dotted line, and the counterfactual estimate with its 95% credible interval is shown as the white dotted line and gray shaded area. if the black line is below the gray shaded area, this would indicate that obsrved cases are lower than expected based on changes in the control diseases in the post-vaccine period. If the controls appropriately adjust for underlying trends, then this would reflect an effect of the vaccine.
plots.ridge$groups[["1"]]$pred_full plots.ridge.ar1$groups[["1"]]$pred_full
It is sometimes easier to look at the results if we aggregate the observed and expected values up to the annual time scale. Here the observed values are shown as black dots. When the black dots go below the gray shaded area, this indicates that the observed cases are lower than expected based on changes in the control diseases in the post-vaccine period. If the controls appropriately adjust for underlying trends, then this would reflect an effect of the vaccine. The vertical line indicates the date of vaccine introduction. The year when the vaccine is introduced is included to the right of the line, even if the vaccine was introduced part-way through the year. For instance, regardless of whether the the vaccine was introduced in January 2009 or November 2009, the observed dot for 2009 would be to the right of the line.
plots.spike$groups[["1-4 years"]]$pred_full_agg
Finally, we can look at the cumulative cases prevented. In this example, there have been 445 cases prevented (95%CrI: 58, 931) from the time of vaccine introduction to the last day month of the study period. This is calculated by takin the difference between the observed and fitted number of cases in each month, and summing them. If atleast 1 control disease is identified from the synthetic controls model, then the result here is drawn from that model, otherwise, it is drawn from the STL+PCA model.
plots.spike$groups[["1-4 years"]]$cumsum_prevented
rho INLA
round(sapply(results.ridge.ar1$full$groups, function(x) x$rho[['0.5quant']], simplify='array'),2) #round(sapply(results.ridge.iid$full$groups, function(x) x$rho[['0.5quant']], simplify='array'),2)
betas INLA
beta1<-sapply(results.ridge.ar1$full$groups, function(x) x$beta, simplify=F) top.betas<-lapply(beta1, function(x){ x<-x[order(-x[,'beta.posterior.median']),] x[(1:(min(nrow(x),5))),1] }) top.betas
Compare contributions of fixed and random effects
fixed<-lapply(results.ridge.ar1$full$groups, function(x) x$fixed.effect.hdi) random<-lapply(results.ridge.ar1$full$groups, function(x) x$rand.eff.combined.q) fixed.iid<-lapply(results.ridge.iid$full$groups, function(x) x$fixed.effect.hdi) random.iid<-lapply(results.ridge.iid$full$groups, function(x) x$rand.eff.combined.q) par(mfrow=c(3,2)) for(i in (1: length(fixed))){ yrange<-range(c(fixed[[i]], random[[i]])) matplot(fixed[[i]], type='l', bty='l', lty=c(1,2,2), col='gray', ylim=yrange) matplot(random[[i]], type='l', bty='l', lty=c(2,1,2), col='gray', ylim=yrange) }
fixed<-lapply(results.ridge.iid$full$groups, function(x) x$fixed.effect.hdi) random<-lapply(results.ridge.iid$full$groups, function(x) x$rand.eff.combined.q) par(mfrow=c(3,2)) for(i in (1: length(fixed))){ yrange<-range(c(fixed[[i]], random[[i]])) matplot(fixed[[i]], type='l', bty='l', lty=c(1,2,2), col='gray', ylim=yrange) matplot(random[[i]], type='l', bty='l', lty=c(2,1,2), col='gray', ylim=yrange) }
Why does RR for AR1 and iid have same uncertainty interval?. The AR1 error piece is definitely wider here. But as plots above show, the unexplained error is small relative to the variation in the fixed effects, so when this AR1 piece gets added on to the regression piece, it has a small effect
set.seed(123) nsim=10000 nobs<-100 sd1<-2.5 mean1<-0 rho=0.5 sim.iid<- matrix(NA, nrow=nobs, ncol=nsim) sim.iid<-apply(sim.iid, 2, function(x) rnorm(n=nobs, mean=mean1, sd=sd1) ) sim.iid.sum<-apply(sim.iid, 2, function(x) sum(x) ) sim.iid.sum.sd<-sd(sim.iid.sum) sim.iid.sum.mean<-mean(sim.iid.sum) sim.iid.sum.q<-quantile(sim.iid.sum, probs=c(0.025,0.5,0.975)) sim.ar1<- matrix(NA, nrow=nobs, ncol=nsim) sim.ar1<-apply(sim.ar1, 2, function(x){ x[1]<-mean1 for(i in 2:length(x)){ x[i]<- x[i-1]*rho + rnorm(n=1, mean=0, sd=sd1) } return(x) } ) sim.ar1.sum<-apply(sim.ar1, 2, function(x) sum(x) ) sim.ar1.sum.sd<-sd(sim.ar1.sum) sim.ar1.sum.mean<-mean(sim.ar1.sum) sim.ar1.sum.q<-quantile(sim.ar1.sum, probs=c(0.025,0.5,0.975)) sim.ar1.sum.q sim.iid.sum.q matplot(sim.ar1[,1:10], type='l', ylim=c(-20, 20)) matplot(sim.iid[,1:10], type='l', ylim=c(-20,20)) sim.iid.sum.sd sim.ar1.sum.sd
#pointwise uqantile sim.ar1.q<-t(apply(sim.ar1,1,quantile, probs=c(0.025,0.5,0.975))) sim.iid.q<-t(apply(sim.iid,1,quantile, probs=c(0.025,0.5,0.975))) matplot(sim.ar1.q, type='l', lty=c(2,1,2), col='red') matplot(sim.iid.q, type='l', lty=c(2,1,2), col='blue', add=T)
try to run analysis outside of loop
analysis<-analysis.ridge.ar1 zoo_data=ds1 intervention_date=analysis$intervention_date n_seasons= analysis$n_seasons time_points= analysis$time_points error_dist=analysis$error_dist model.variant='full' y <- zoo_data[, 1] #all y y.pre <-y y.pre[time_points >= as.Date(intervention_date)]<-NA post_period_response<- y[time_points >= as.Date(intervention_date)] x.all <-as.matrix(zoo_data[,c('X1','X2')]) #Removes outcome column from dataset months<-model.matrix(~as.factor(month(zoo_data$date)), data=zoo_data)[,-1] y.pre[time_points>=analysis$intervention_date]<-NA #Filter columsn with 0 variations in the covariate in the pre-vax period #x.var<-apply(x,2, function(xx) var(xx)) #x.var[is.na(x.var)]<-0 #x<-x[,x.var>0] ##NEEDTO SEPRATE OUT SEASONAL DUMMIES FROM COVARS x<-x.all x.months<-months dimnames(x.months)[[2]]<-paste0('season', 1:(analysis$n_seasons-1)) #Should be already scaled, but doesn't hurt... x.scale<-apply(x,2, function(z) scale(z)) df1<-cbind.data.frame(y.pre, 'month'=as.factor(month(time_points)), y, x.scale ) df1$t<-1:nrow(df1) covar.df.full<-cbind.data.frame( x.scale, 'month'=df1$month) #Matrix for restriction random effects x.in.full<-as.data.frame(model.matrix(~. , data=covar.df.full))[,-1] #dummies for month, remove intercept mod.df.full<- cbind.data.frame(y,y.pre, x.in.full) A.full<- t(mod.df.full[,3:15]) A.full[,is.na(mod.df.full$y.pre)]<-0 #extraploation period shouldn't factor into constraint e.full=rep(0, nrow(A.full)) n <- nrow(mod.df.full) X <- matrix(1,nrow = n, ncol= 1) Z <- as.matrix(x.in.full) pX = ncol(X) pZ = ncol(Z) idx.X = c(1:pX, rep(NA, pZ)) idx.Z = c(rep(NA,pX), 1:pZ) hyper.fixed = list(prec = list(initial = log(0.00001), fixed=TRUE)) param.beta = list(prec = list(param = c(1.0e-3, 1.0e-3))) param.Z = param.beta param.data = list(prec = list(param = c(1.0e-3, 1.0e-3))) t=1:nrow(x.in.full) #http://www.r-inla.org/faq add.beta.cols<-as.data.frame(matrix(rep(1:ncol(x.in.full), each=nrow(x.in.full)), nrow=nrow(x.in.full))) names(add.beta.cols)<-paste0('beta', 1:ncol(x.in.full)) mod.df.full<-cbind.data.frame(mod.df.full,add.beta.cols) #Setup for Ridge regression #https://bitbucket.org/hrue/r-inla/src/0d9389b414979bcf3507c60f69a50488d18e4c6c/r-inla.org/examples/stacks/stacks.R?at=default x.in.full.no.season<-x.in.full[,-grep('month', names(x.in.full))] x.in.full.season<-x.in.full[,grep('month', names(x.in.full))] first.beta<-paste0("f(beta1,", names(x.in.full.no.season)[1], ",model='iid',hyper = param.beta,", "values=c(", paste(1:ncol(x.in.full.no.season), collapse=',') ,"))" ) next.beta.f<-function(x, covar.index){ paste0("f(beta",covar.index,",", names(x.in.full.no.season)[covar.index], ",copy='beta1', fixed=T)") } betas.list<- mapply(next.beta.f, x=x.in.full.no.season[-1], covar.index=2:ncol(x.in.full.no.season), SIMPLIFY=T) next.betas<- paste(betas.list, collapse='+') all.betas<-paste(first.beta,next.betas , sep='+') if(error_dist=='ar1'){ form1<- as.formula(paste0("y.pre ~", paste(names(x.in.full.season), collapse="+"),"+", all.betas, "+ f(t, model = 'ar1', constr=T,extraconstr=list(A=A.full, e=e.full))" ) ) #form1<- as.formula(paste0("y.pre ~", paste(names(x.in.full.season), collapse="+"),"+", all.betas, "+ f(t, model = 'ar1')") ) }else{ form1<- as.formula(paste0("y.pre ~", paste(names(x.in.full.season), collapse="+"),"+", all.betas, "+ f(t, model = 'iid')") ) } mod1.full = inla(form1, data = mod.df.full, control.predictor = list(compute=TRUE, link = 1), family='poisson', control.compute=list(config = TRUE,waic=TRUE)) waic<-mod1.full$waic$waic pd<- mod1.full$waic$p.eff ds<-list('fitted.model'=mod1.full, 'mod.df'=mod.df.full, 'x.in'=x.in.full,'offset'=NA,'offset.used'=F,'ridge'=T, 'waic'=waic, 'pd'=pd) if(ds$ridge==T){ coefs<-ds$fitted.model$summary.random[['beta1']] coefs$covar<-names(ds$x.in)[-grep('month',names(ds$x.in)) ] }else{ coefs<-ds$fitted.model$summary.fixed } coefs<-coefs[order(coefs$`0.5quant`),] posterior.list<-inla.posterior.sample(n=500, ds$fitted.model, seed=123) post.labels<-dimnames(posterior.list[[1]]$latent)[[1]] #post.labels<-gsub(":", "_", post.labels) posterior.samples<- sapply(posterior.list, '[[', 'latent') preds.select<-grep('Predictor',post.labels ) rand.eff.select.t1<-which(substr(post.labels,1,2 )=='t:') if(ds$ridge==T){ covar.select.betas<-grep('beta1:', post.labels) covar.select.months<-grep('month', post.labels) covar.select<-c(covar.select.betas,covar.select.months) }else{ covar.select<- which(names(ds$x.in) %in% sub("\\:.*", "", post.labels)) } beta.posterior<-(posterior.samples[covar.select,]) beta.posterior.median<- apply(beta.posterior,1,median) beta.posterior.hdi<-cbind(beta.posterior.median,t(hdi(t(beta.posterior), credMass = 0.95))) row.names(beta.posterior.hdi)<-names(ds$x.in) beta.fix.posterior.hdi<-beta.posterior.hdi[-grep('month',names(ds$x.in)),] fixed.effect<- as.matrix(ds$x.in) %*% beta.posterior #fixed piece of regression, excluding intercept fixed.effect.hdi<-t(hdi(t(fixed.effect), credMass = 0.95)) fixed.effect.median<-apply(fixed.effect,1, median) fixed.effect.hdi<-cbind.data.frame('median'=fixed.effect.median, fixed.effect.hdi) posterior.preds<-exp(posterior.samples[preds.select,]) #lambda #now take Poisson samples withmean of lambda posterior.preds.counts<- matrix(rpois(n=length(posterior.preds), lambda=posterior.preds), nrow=nrow(posterior.preds), ncol=ncol(posterior.preds)) rand.eff.t1<-posterior.samples[rand.eff.select.t1,] rand.eff1.q<-t(apply(rand.eff.t1, 1, quantile, probs=c(0.025,0.5,0.975))) posterior.preds.q<-t(apply(posterior.preds.counts,1,quantile, probs=c(0.025, 0.5, 0.975))) posterior.median<-as.integer(round(t(apply(posterior.preds.counts,1,median)))) ci<- t(hdi(t(posterior.preds.counts), credMass = 0.95)) posterior.pred.hdi<- cbind.data.frame('median'=posterior.median, ci) rho1<-ds$fitted.model$summary.hyperpar[3,c('0.5quant','0.025quant', '0.975quant')] rand.eff.combined<-rand.eff.t1 #+rand.eff.t2 rand.eff.combined.q<-t(apply(rand.eff.combined, 1, quantile, probs=c(0.025,0.5,0.975))) log.rr.pointwise<- apply(posterior.preds.counts, 2, function(x)log( (mod.df.full$y+1)/ (x+1)) ) log.rr.pointwise.ci<- t(hdi(t(log.rr.pointwise), credMass = 0.95)) log.rr.pointwise.median<- apply(log.rr.pointwise.ci,1,median) log_rr_full_t_hdi<-cbind(log.rr.pointwise.median,log.rr.pointwise.ci) # matplot(log.rr, type='l', col='gray', lty=c(2,1,2)) # abline(h=0, col='red') log_rr_full_t_quantiles<- t(apply(log.rr.pointwise,1,quantile, probs=c(0.025,0.5,0.975))) log_rr_full_t_sd<- apply(log.rr.pointwise,1,sd) #log_rr_full_t_samples.prec.post<-1/log_rr_full_t_sd^2 post.period<- which(time_points>= analysis$eval_period[1] &time_points<= analysis$eval_period[2] ) post.samples<-posterior.preds.counts[post.period,] post.samples.sum<-apply(post.samples,2,sum) obs.post.sum<- sum(mod.df.full$y[post.period]) rr.agg<-obs.post.sum/post.samples.sum rr.q<-quantile(rr.agg, probs=c(0.025, 0.5, 0.975)) rr.hdi<-c(rr.q['50%'],hdi(rr.agg, credMass = 0.95)) mean_rr<-mean(rr.agg) post.samples.sum.q<-quantile(post.samples.sum, probs=c(0.025, 0.5, 0.975)) pred.hdi <- cbind( post.samples.sum['50%'],hdi(post.samples.sum, credMass = 0.95)) year<-year(time_points) pred.sum.year.post<-apply(posterior.preds.counts,2, function(x) aggregate(x, by=list('year'=year), FUN=sum)) pred.sum.year.post<-sapply(pred.sum.year.post,function(x) x[,2]) pred.sum.year.ci<- t(hdi(t(pred.sum.year.post), credMass = 0.95)) pred_samples_post_full<-post.samples pred<-t(apply(posterior.preds.counts,1, quantile, probs=c(0.025, 0.5, 0.975)))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.