In weinbergerlab/InterventionEvaluatR: InterventionEvaluatR

knitr::opts_chunk$set(
  collapse = TRUE,
  echo=F,
  comment = "#>",
  fig.height = 3,
  fig.width = 5,
  fig.align = "center", 
  dpi=300, 
    out.width="600px"
)

#Install the package
#library(devtools)
#devtools::install_github('https://github.com/weinbergerlab/InterventionEvaluatR') 
library(knitr)
library(plyr)
library(InterventionEvaluatR)
library(rjags)
library(coda)
library(HDInterval)
library(lubridate)
library(pbapply)
library(parallel)
library(plyr)
library(ggbiplot)
library(htmlTable)
library(nFactors)
library(INLA)
library(xtable)


library(bsts)

title: "Estimated change associated with the introduction of vaccine in South Africa"

Important note

This Rmd file contains analyses of the data from S Africa. It uses a slightly different model (Poisson) than the one described in Kleynhans et al. To see those results, see the synthetic_control_run.R file

In this file, we use ridge regression to weight the control diseases. This is done in the INLA package, which is computationally faster than the spike and slab variable selection that has been used in the past. In many instance, the results are very close between the spike and slab method and the ridge regression method, but there are some instances where differences arise. The two approaches are compared here.

Prepare the dataset

  n.time<-120

ar1.gen<-function(rho, prec.set){
  y<-rep(NA, times=n.time)
  y[1]<-  0
  for(i in 2:n.time){
    y[i]<- y[i-1]*rho +  rnorm(1,0, sqrt(1/(prec.set/(1-rho*rho)) ))
  }
  y<-y 
  return(y)
}

set.seed(123)

X1<-rpois(n.time, 15)
X2<-rpois(n.time, 10)
ar.piece<-ar1.gen(rho=0.7, prec.set=10) 
fixed.piece<- 0.2*log(X1) +0.3*log(X2)
ts1<-exp(1+ar.piece + fixed.piece)
ts1[108:120]<-ts1[108:120]*0.8
ts1.count<-rpois(length(ts1), ts1)
dates<-seq.Date(from=as.Date('2000-01-01'), length.out = n.time, by='month')
ds1<-cbind.data.frame('Y'=ts1.count,'rep'=1,'date'=dates, X1, X2, denom=1)
ds1$rep<-as.factor(ds1$rep)

Set parameters for analysis

Here we need to set a few parameters. We Use the evaluatr.init() function to specify the name of the dataset, the date at which the vaccine is introduced, the date at which we want to begin evaluating the vaccine (typically 1-2 year after vaccine introduction). We also provide some information on the dataset, sch as whether the data are monthly or quarterly (n_seasons), the variable names for the grouping variable, the date variable, the outcome variable, and the denominator variable (if any).

Select which age group to analyze

#age.select=unique(sa2$age)[6]
#age.select

analysis.ridge.iid <- evaluatr.init(
  country = "sim", data = ds1,
  post_period_start = "2008-01-01", 
  eval_period_start = "2009-01-01", 
  eval_period_end = "2009-12-01", 
  n_seasons = 12, #This is monthly data, so select 12
  year_def = "cal_year", # we are in southern hemisphere, so aggregate results by calendar year (Jan-Dec)
  group_name = "rep",  #Strata categry name
  date_name = "date", #Date variable name
  outcome_name = "Y", #Outcome variable name
  denom_name = "denom", #Denominator variable name
  ridge=T,  error_dist='iid'
)
set.seed(1)

analysis.ridge.ar1 <- evaluatr.init(
  country = "sim", data = ds1,
  post_period_start = "2008-01-01", 
  eval_period_start = "2009-01-01", 
  eval_period_end = "2009-12-01", 
  n_seasons = 12, #This is monthly data, so select 12
  year_def = "cal_year", # we are in southern hemisphere, so aggregate results by calendar year (Jan-Dec)
  group_name = "rep",  #Strata categry name
  date_name = "date", #Date variable name
  outcome_name = "Y", #Outcome variable name
  denom_name = "denom", #Denominator variable name
  ridge=T,  error_dist='ar1'
)
set.seed(1)

#source('inla_functions.R')
#results1<-evaluatr.impact(analysis, variants=c('full', 'time', 'time_no_offset') )

ptm <- proc.time()
results.ridge.iid<-evaluatr.impact(analysis.ridge.iid, variants=c('full', 'time', 'time_no_offset') )
proc.time() - ptm

#source('inla_functions.R')
#results1<-evaluatr.impact(analysis, variants=c('full', 'time', 'time_no_offset') )

ptm <- proc.time()
results.ridge.ar1<-evaluatr.impact(analysis.ridge.ar1, variants=c('full', 'time', 'time_no_offset') )
proc.time() - ptm

note:Adding more samples does not affect results at all. What is important is ensuring there are enough PCs in the A matrix when restricting the AR1 random effect. If based on PCs with variance>1, there is still residual bias. Just taking an arbitrary 5 PCs seems to do the trick.

Main results

compare.results<-function(res1){
results.table<- cbind.data.frame(
  #impact_results$best$rr_mean_intervals, 
  res1$full$rr_mean_intervals, 
  res1$time$rr_mean_intervals, 
  res1$time_no_offset$rr_mean_intervals) 
  #res1$its$rr_mean_intervals )
  #impact_results$pca$rr_mean_intervals)
  table<-xtable(results.table)
  htmlTable(table)
}
compare.results(results.ridge.ar1)
compare.results(results.ridge.iid)

Cases averted

How many cases were prevented from the time of vaccine introduction to the last time point in each stratum (+/- 95% CrI)? You can modify the number 'last.point' to pull out the cumulative number of cases at any point in the time series. In this case we are printing results fromthe SC model

print.cum<-function(res1){
last.point<-dim(res1$full$cumsum_prevented)[1]
cum.prevented<-res1$full$cumsum_prevented[last.point,,]
cum1<- round(t(cum.prevented))
cum2<- paste0(cum1[,'50%'], ' (', cum1[,'2.5%'],', ',cum1[,'97.5%'],')')
cum3<-cbind.data.frame(row.names(cum1), cum2)
names(cum3)<-c('Stratum','Cases Averted (95% CrI)')
  htmlTable(cum3, align='l')}

print.cum(results.ridge.iid)
print.cum(results.spike)

plots.ridge <- evaluatr.plots(analysis.ridge.iid)
plots.ridge.ar1 <- evaluatr.plots(analysis.ridge.ar1)

Plot the results for 1 age group

First look at the results from the synthetic controls model for 1 age group.

This first plot shows a monthly time series, with observed, fitted, and counterfacual values. The observed number of deaths is shown in the black line. The fitted values for the pre-vaccine period are shown in the red dotted line, and the counterfactual estimate with its 95% credible interval is shown as the white dotted line and gray shaded area. if the black line is below the gray shaded area, this would indicate that obsrved cases are lower than expected based on changes in the control diseases in the post-vaccine period. If the controls appropriately adjust for underlying trends, then this would reflect an effect of the vaccine.

  plots.ridge$groups[["1"]]$pred_full 
  plots.ridge.ar1$groups[["1"]]$pred_full

It is sometimes easier to look at the results if we aggregate the observed and expected values up to the annual time scale. Here the observed values are shown as black dots. When the black dots go below the gray shaded area, this indicates that the observed cases are lower than expected based on changes in the control diseases in the post-vaccine period. If the controls appropriately adjust for underlying trends, then this would reflect an effect of the vaccine. The vertical line indicates the date of vaccine introduction. The year when the vaccine is introduced is included to the right of the line, even if the vaccine was introduced part-way through the year. For instance, regardless of whether the the vaccine was introduced in January 2009 or November 2009, the observed dot for 2009 would be to the right of the line.

      plots.spike$groups[["1-4 years"]]$pred_full_agg

Finally, we can look at the cumulative cases prevented. In this example, there have been 445 cases prevented (95%CrI: 58, 931) from the time of vaccine introduction to the last day month of the study period. This is calculated by takin the difference between the observed and fitted number of cases in each month, and summing them. If atleast 1 control disease is identified from the synthetic controls model, then the result here is drawn from that model, otherwise, it is drawn from the STL+PCA model.

      plots.spike$groups[["1-4 years"]]$cumsum_prevented

rho INLA

round(sapply(results.ridge.ar1$full$groups, function(x) x$rho[['0.5quant']], simplify='array'),2)
#round(sapply(results.ridge.iid$full$groups, function(x) x$rho[['0.5quant']], simplify='array'),2)

betas INLA

beta1<-sapply(results.ridge.ar1$full$groups, function(x) x$beta, simplify=F)
top.betas<-lapply(beta1, function(x){
  x<-x[order(-x[,'beta.posterior.median']),]
  x[(1:(min(nrow(x),5))),1]
  })
top.betas

Compare contributions of fixed and random effects

fixed<-lapply(results.ridge.ar1$full$groups, function(x) x$fixed.effect.hdi)
random<-lapply(results.ridge.ar1$full$groups, function(x) x$rand.eff.combined.q)

fixed.iid<-lapply(results.ridge.iid$full$groups, function(x) x$fixed.effect.hdi)
random.iid<-lapply(results.ridge.iid$full$groups, function(x) x$rand.eff.combined.q)
  par(mfrow=c(3,2))

for(i in (1: length(fixed))){
  yrange<-range(c(fixed[[i]], random[[i]]))
  matplot(fixed[[i]], type='l', bty='l', lty=c(1,2,2), col='gray', ylim=yrange)
    matplot(random[[i]], type='l', bty='l', lty=c(2,1,2), col='gray', ylim=yrange)

}

fixed<-lapply(results.ridge.iid$full$groups, function(x) x$fixed.effect.hdi)
random<-lapply(results.ridge.iid$full$groups, function(x) x$rand.eff.combined.q)
  par(mfrow=c(3,2))

for(i in (1: length(fixed))){
  yrange<-range(c(fixed[[i]], random[[i]]))
  matplot(fixed[[i]], type='l', bty='l', lty=c(1,2,2), col='gray', ylim=yrange)
    matplot(random[[i]], type='l', bty='l', lty=c(2,1,2), col='gray', ylim=yrange)

}

Why does RR for AR1 and iid have same uncertainty interval?. The AR1 error piece is definitely wider here. But as plots above show, the unexplained error is small relative to the variation in the fixed effects, so when this AR1 piece gets added on to the regression piece, it has a small effect

set.seed(123)
nsim=10000
nobs<-100
sd1<-2.5
mean1<-0
rho=0.5

sim.iid<- matrix(NA, nrow=nobs, ncol=nsim)
sim.iid<-apply(sim.iid, 2, function(x) rnorm(n=nobs, mean=mean1, sd=sd1)  )
sim.iid.sum<-apply(sim.iid, 2, function(x) sum(x) )
sim.iid.sum.sd<-sd(sim.iid.sum)
sim.iid.sum.mean<-mean(sim.iid.sum)
sim.iid.sum.q<-quantile(sim.iid.sum, probs=c(0.025,0.5,0.975))

sim.ar1<- matrix(NA, nrow=nobs, ncol=nsim)
sim.ar1<-apply(sim.ar1, 2, function(x){
  x[1]<-mean1
  for(i in 2:length(x)){
    x[i]<- x[i-1]*rho + rnorm(n=1, mean=0, sd=sd1) 
  }
  return(x)
} )
sim.ar1.sum<-apply(sim.ar1, 2, function(x) sum(x) )
sim.ar1.sum.sd<-sd(sim.ar1.sum)
sim.ar1.sum.mean<-mean(sim.ar1.sum)
sim.ar1.sum.q<-quantile(sim.ar1.sum, probs=c(0.025,0.5,0.975))

sim.ar1.sum.q
sim.iid.sum.q

 matplot(sim.ar1[,1:10], type='l', ylim=c(-20, 20))
 matplot(sim.iid[,1:10], type='l', ylim=c(-20,20))



sim.iid.sum.sd
sim.ar1.sum.sd

#pointwise uqantile
sim.ar1.q<-t(apply(sim.ar1,1,quantile, probs=c(0.025,0.5,0.975)))
sim.iid.q<-t(apply(sim.iid,1,quantile, probs=c(0.025,0.5,0.975)))

matplot(sim.ar1.q, type='l', lty=c(2,1,2), col='red')
matplot(sim.iid.q, type='l', lty=c(2,1,2), col='blue', add=T)

try to run analysis outside of loop

analysis<-analysis.ridge.ar1
zoo_data=ds1
                    intervention_date=analysis$intervention_date
                    n_seasons= analysis$n_seasons
                    time_points= analysis$time_points
                    error_dist=analysis$error_dist
                    model.variant='full'

  y <- zoo_data[, 1] #all y
  y.pre <-y
  y.pre[time_points >= as.Date(intervention_date)]<-NA
  post_period_response<- y[time_points >= as.Date(intervention_date)]
  x.all <-as.matrix(zoo_data[,c('X1','X2')]) #Removes outcome column from dataset
  months<-model.matrix(~as.factor(month(zoo_data$date)), data=zoo_data)[,-1]

  y.pre[time_points>=analysis$intervention_date]<-NA

  #Filter columsn with 0 variations in the covariate in the pre-vax period
  #x.var<-apply(x,2, function(xx) var(xx))  
  #x.var[is.na(x.var)]<-0
  #x<-x[,x.var>0] 

  ##NEEDTO SEPRATE OUT SEASONAL DUMMIES FROM COVARS
  x<-x.all
  x.months<-months
  dimnames(x.months)[[2]]<-paste0('season', 1:(analysis$n_seasons-1))

  #Should be already scaled, but doesn't hurt...
  x.scale<-apply(x,2, function(z) scale(z)) 

  df1<-cbind.data.frame(y.pre, 'month'=as.factor(month(time_points)), y, x.scale )
  df1$t<-1:nrow(df1)
  covar.df.full<-cbind.data.frame( x.scale, 'month'=df1$month) 

  #Matrix for restriction random effects
  x.in.full<-as.data.frame(model.matrix(~. , data=covar.df.full))[,-1] #dummies for month, remove intercept
  mod.df.full<- cbind.data.frame(y,y.pre, x.in.full)


    A.full<- t(mod.df.full[,3:15])
    A.full[,is.na(mod.df.full$y.pre)]<-0  #extraploation period shouldn't factor into constraint
    e.full=rep(0, nrow(A.full))

    n <- nrow(mod.df.full)
    X <- matrix(1,nrow = n, ncol= 1)
    Z <- as.matrix(x.in.full) 
    pX = ncol(X) 
    pZ = ncol(Z)
    idx.X = c(1:pX, rep(NA, pZ))
    idx.Z = c(rep(NA,pX), 1:pZ)
    hyper.fixed = list(prec = list(initial = log(0.00001), fixed=TRUE))
    param.beta = list(prec = list(param = c(1.0e-3, 1.0e-3))) 
    param.Z =  param.beta   
    param.data = list(prec = list(param = c(1.0e-3, 1.0e-3)))
    t=1:nrow(x.in.full)
    #http://www.r-inla.org/faq
    add.beta.cols<-as.data.frame(matrix(rep(1:ncol(x.in.full), each=nrow(x.in.full)), nrow=nrow(x.in.full)))
    names(add.beta.cols)<-paste0('beta', 1:ncol(x.in.full))
    mod.df.full<-cbind.data.frame(mod.df.full,add.beta.cols)
    #Setup for Ridge regression
    #https://bitbucket.org/hrue/r-inla/src/0d9389b414979bcf3507c60f69a50488d18e4c6c/r-inla.org/examples/stacks/stacks.R?at=default
    x.in.full.no.season<-x.in.full[,-grep('month', names(x.in.full))]
    x.in.full.season<-x.in.full[,grep('month', names(x.in.full))]
    first.beta<-paste0("f(beta1,", names(x.in.full.no.season)[1], ",model='iid',hyper = param.beta,", "values=c(", paste(1:ncol(x.in.full.no.season), collapse=',') ,"))" )
    next.beta.f<-function(x, covar.index){ 
      paste0("f(beta",covar.index,",", names(x.in.full.no.season)[covar.index], ",copy='beta1', fixed=T)")
    }
    betas.list<- mapply(next.beta.f, x=x.in.full.no.season[-1], covar.index=2:ncol(x.in.full.no.season), SIMPLIFY=T)
    next.betas<- paste(betas.list, collapse='+')  
    all.betas<-paste(first.beta,next.betas , sep='+')    

    if(error_dist=='ar1'){
    form1<- as.formula(paste0("y.pre ~", paste(names(x.in.full.season), collapse="+"),"+", all.betas,    "+ f(t, model = 'ar1', constr=T,extraconstr=list(A=A.full, e=e.full))" ) )
    #form1<- as.formula(paste0("y.pre ~", paste(names(x.in.full.season), collapse="+"),"+", all.betas,    "+ f(t, model = 'ar1')") )

       }else{
    form1<- as.formula(paste0("y.pre ~", paste(names(x.in.full.season), collapse="+"),"+", all.betas,    "+ f(t, model = 'iid')") )

    }

    mod1.full = inla(form1, data = mod.df.full, 
                     control.predictor = list(compute=TRUE, link = 1), 
                     family='poisson',
                     control.compute=list(config = TRUE,waic=TRUE))
    waic<-mod1.full$waic$waic
    pd<- mod1.full$waic$p.eff
    ds<-list('fitted.model'=mod1.full, 'mod.df'=mod.df.full, 'x.in'=x.in.full,'offset'=NA,'offset.used'=F,'ridge'=T, 'waic'=waic, 'pd'=pd)


  if(ds$ridge==T){  
    coefs<-ds$fitted.model$summary.random[['beta1']]
    coefs$covar<-names(ds$x.in)[-grep('month',names(ds$x.in)) ]
  }else{
    coefs<-ds$fitted.model$summary.fixed
  }
  coefs<-coefs[order(coefs$`0.5quant`),]

  posterior.list<-inla.posterior.sample(n=500, ds$fitted.model, seed=123)
  post.labels<-dimnames(posterior.list[[1]]$latent)[[1]]
  #post.labels<-gsub(":", "_", post.labels)
  posterior.samples<- sapply(posterior.list, '[[', 'latent')
  preds.select<-grep('Predictor',post.labels )
  rand.eff.select.t1<-which(substr(post.labels,1,2 )=='t:')

  if(ds$ridge==T){  
    covar.select.betas<-grep('beta1:', post.labels)
    covar.select.months<-grep('month', post.labels)
    covar.select<-c(covar.select.betas,covar.select.months)
  }else{
    covar.select<- which(names(ds$x.in)  %in% sub("\\:.*", "", post.labels))
  }

  beta.posterior<-(posterior.samples[covar.select,]) 
  beta.posterior.median<- apply(beta.posterior,1,median)
  beta.posterior.hdi<-cbind(beta.posterior.median,t(hdi(t(beta.posterior), credMass = 0.95)))
  row.names(beta.posterior.hdi)<-names(ds$x.in)

  beta.fix.posterior.hdi<-beta.posterior.hdi[-grep('month',names(ds$x.in)),]

  fixed.effect<- as.matrix(ds$x.in) %*% beta.posterior #fixed piece of regression, excluding intercept
  fixed.effect.hdi<-t(hdi(t(fixed.effect), credMass = 0.95))
  fixed.effect.median<-apply(fixed.effect,1, median)
  fixed.effect.hdi<-cbind.data.frame('median'=fixed.effect.median, fixed.effect.hdi)

  posterior.preds<-exp(posterior.samples[preds.select,]) #lambda
  #now take Poisson samples withmean of lambda
  posterior.preds.counts<- matrix(rpois(n=length(posterior.preds), lambda=posterior.preds), nrow=nrow(posterior.preds), ncol=ncol(posterior.preds))

  rand.eff.t1<-posterior.samples[rand.eff.select.t1,]
  rand.eff1.q<-t(apply(rand.eff.t1, 1, quantile, probs=c(0.025,0.5,0.975)))

  posterior.preds.q<-t(apply(posterior.preds.counts,1,quantile, probs=c(0.025, 0.5, 0.975)))
  posterior.median<-as.integer(round(t(apply(posterior.preds.counts,1,median))))
  ci<- t(hdi(t(posterior.preds.counts), credMass = 0.95))
  posterior.pred.hdi<- cbind.data.frame('median'=posterior.median, ci)

  rho1<-ds$fitted.model$summary.hyperpar[3,c('0.5quant','0.025quant', '0.975quant')]

  rand.eff.combined<-rand.eff.t1 #+rand.eff.t2
  rand.eff.combined.q<-t(apply(rand.eff.combined, 1, quantile, probs=c(0.025,0.5,0.975)))

  log.rr.pointwise<- apply(posterior.preds.counts, 2, function(x)log( (mod.df.full$y+1)/ (x+1)) )
  log.rr.pointwise.ci<- t(hdi(t(log.rr.pointwise), credMass = 0.95))
  log.rr.pointwise.median<- apply(log.rr.pointwise.ci,1,median)
  log_rr_full_t_hdi<-cbind(log.rr.pointwise.median,log.rr.pointwise.ci)
  # matplot(log.rr, type='l', col='gray', lty=c(2,1,2))
  # abline(h=0, col='red')
  log_rr_full_t_quantiles<- t(apply(log.rr.pointwise,1,quantile, probs=c(0.025,0.5,0.975)))
  log_rr_full_t_sd<- apply(log.rr.pointwise,1,sd)
  #log_rr_full_t_samples.prec.post<-1/log_rr_full_t_sd^2

  post.period<- which(time_points>= analysis$eval_period[1]  &time_points<= analysis$eval_period[2] )

  post.samples<-posterior.preds.counts[post.period,]

  post.samples.sum<-apply(post.samples,2,sum)
  obs.post.sum<- sum(mod.df.full$y[post.period])
  rr.agg<-obs.post.sum/post.samples.sum
  rr.q<-quantile(rr.agg, probs=c(0.025, 0.5, 0.975)) 
  rr.hdi<-c(rr.q['50%'],hdi(rr.agg, credMass = 0.95))
  mean_rr<-mean(rr.agg)

  post.samples.sum.q<-quantile(post.samples.sum, probs=c(0.025, 0.5, 0.975))
  pred.hdi <- cbind( post.samples.sum['50%'],hdi(post.samples.sum, credMass = 0.95))

  year<-year(time_points)  
  pred.sum.year.post<-apply(posterior.preds.counts,2, function(x) aggregate(x, by=list('year'=year), FUN=sum))
  pred.sum.year.post<-sapply(pred.sum.year.post,function(x) x[,2])
  pred.sum.year.ci<- t(hdi(t(pred.sum.year.post), credMass = 0.95))

  pred_samples_post_full<-post.samples
  pred<-t(apply(posterior.preds.counts,1, quantile, probs=c(0.025, 0.5, 0.975)))