In eeholmes/CoV19: Eli's package for figures and data on CoV-19 epidemic

knitr::opts_chunk$set(echo = FALSE, warning=FALSE, message=FALSE)

library(CoV19)
library(ggplot2)
library(gridExtra)

abb.names1 <- c("newengland.abb", "midatlantic.abb", "encentralmidwest.abb", "wncentralmidwest.abb",
               "nsatlantic.abb", "ssatlantic.abb", "south.abb", "mtnnorth.abb", "mtnsw.abb", "westcoast.abb")
abb.names2 <- c("northeast.abb", "atlantic.abb", "midwest.abb", "south.abb", "west.abb")
newengland.abb <- c("ME","VT","NH","MA","RI","CT")
midatlantic.abb <- c("NY","NJ","PA")
encentralmidwest.abb <- c("OH", "IN", "IL", "WI", "MI")
wncentralmidwest.abb <- c("MN", "MO", "ND", "SD", "IA", "NE", "KS")
nsatlantic.abb <- c("DC", "DE", "VA", "WV", "MD")
ssatlantic.abb <- c("GA", "FL", "SC", "NC")
south.abb <- c("AL", "MS", "KY", "TN", "TX", "OK", "AR","LA")
mtnnorth.abb <- c("ID", "MT", "WY")
mtnsw.abb <- c("CO", "UT", "NV", "NM", "AZ")
westcoast.abb <- c("WA", "OR", "CA")
west.abb <- c(mtnnorth.abb, mtnsw.abb, westcoast.abb)
midwest.abb <- c(encentralmidwest.abb, wncentralmidwest.abb)
atlantic.abb <- c(nsatlantic.abb, ssatlantic.abb)
northeast.abb <- c(newengland.abb, midatlantic.abb)

# estimate mu, sigma and scaling
f2<-function(x, theta) {
  m<-theta[1]; s<-theta[2]; a<-theta[3];
  a*exp(-0.5*((x-m)/s)^2)
}
f2<-function(x, theta) {
  m<-theta[1]; s<-theta[2]; a<-theta[3];
  x <- x-m
  a*exp(-0.5*(x/s)^2)
}
# fix m
f3<-function(x, theta, m) {
  s<-theta[1]; a<-theta[2];
  a*exp(-0.5*((x-m)/s)^2)
}
# fix s
f4<-function(x, theta, s) {
  m<-theta[1]; a<-theta[2];
  a*exp(-0.5*((x-m)/s)^2)
}
f1<-function(x, theta, a) {
  m<-theta[1]; s<-theta[2];
  a*exp(-0.5*((x-m)/s)^2)
}

fitf1<- function(subx, m=5, a=1, s=0.08){
  # m is actual m not log(day)
  f1<-function(x, theta, a) {
    m<-theta[1]; s<-theta[2];
    ifelse(s<0,-1,1)*a*exp(-0.5*((x-m)/s)^2)
  }
  f1<-function(x, theta, a) {
    m<-theta[1]; s<-theta[2];
    x <- x-m
    ifelse(s<0,-1,1)*a*exp(-0.5*(x/s)^2)
  }
  for(m in c(m, seq(.5,2,.1)*m)){
    fit1<-try(nls(y~f1(x,c(m,s),a), data=subx, start=list(m=m, s=s), control = list(maxiter = 500), algorithm="port", lower=c(0,0.03)), silent=TRUE)
    if(!inherits(fit1, "try-error")) break
  }
  return(fit1)
}

signx <- function(x) return(ifelse(x<0,-1,1))

getcis <- function(fit.best, subx, m=20, s=0.2, sig=0.05, level=0.05){
  if(coef(fit.best)[3] > 20) return(matrix(NA,3,2))
  m <- coef(fit.best)[1]
  s <- coef(fit.best)[2]
  sig <- max(summary(fit.best)$sigma, sig)
  err.best <- residuals(fit.best)
  n <- length(err.best)
  val=c()
  aval <- seq(.5, 10*coef(fit.best)[3],0.1)
  for(a in aval){
    fit <- fitf1(subx, a=a, m=m, s=s)
    err <- residuals(fit)
    rss=sum(err^2, na.rm=TRUE)
    Fval <- (rss-sum(err.best^2, na.rm=TRUE))/sig^2
    tauval <- signx(a-coef(fit.best)[3])*sqrt(Fval)
    if(!is.na(tauval) && tauval>3 && tauval<5) break #catch bad fits with < 5
    if(is.na(tauval)) next
    val <- rbind(val, c(rss=rss, tau=tauval,
                        coef(fit),a=a))
  }
  val <- val[val[,"tau.a"]> -5 & val[,"tau.a"]< 5,]
  mina <- max(which(val[,"tau.a"]<qt(level/2,n-3)),1)
  maxa <- min(which(val[,"tau.a"]>qt(1-level/2,n-3)),nrow(val))
  ret <- cbind(val[mina,c("m", "s", "a")], val[maxa,c("m", "s", "a")])
  if(ret[1,2]<m) ret[,2] <- NA
  if(ret[1,1]>m) ret[,2] <- NA
  return(cbind(val[mina,c("m", "s", "a")], val[maxa,c("m", "s", "a")]))
}

# Get the fit
getfit <- function(x, posmin=100, maxh=200, fun="f2", s.fix=0.08, sig=0.05, fitit=TRUE, fil=7, sub=1){
  x$new.cases.smooth <- stats::filter(x$new.cases, rep(1/fil,fil))
  x$new.cases.smooth <- c(rep(NA,floor(fil/2)),zoo::rollapply(x$new.cases, fil, mean, na.rm=TRUE), rep(NA,floor(fil/2)))
  day1 <- min(which(x$positive>posmin))
  x$day <- 1:nrow(x) - day1 + 1
  x$x <- log(x$day)

  subx <- subset(x, positive>posmin)[,c("date", "positive", "region", "new.cases", "new.cases.smooth", "day", "x")]
  subx <- na.omit(subx)
  subx$x <- log(subx$day)
  subx$y <- subx$new.cases.smooth*subx$day
  maxy <- max(subx$y, na.rm=TRUE)
  subx$y <- subx$y/maxy
  subrange <- seq(nrow(subx) %% sub, nrow(subx),sub)
  subrange <- subrange[subrange!=0]
  subx <- subx[subrange,]

  if(!fitit) return(list(x=x, subx=subx))

  for(s in c(0.05,.1,1)){
    for(m in seq(10,200,25)){
      if(fun=="f2") fit<-try(nls(y~f2(x,c(m,s,a)), data=subx, start=list(m=log(m), s=s, a=max(subx$y, na.rm=TRUE)), control = list(maxiter = 500)), silent=TRUE)
      if(fun=="f4") fit<-try(nls(y~f4(x,c(m,a), s.fix), data=subx, start=list(m=log(m), a=max(subx$y, na.rm=TRUE)), control = list(maxiter = 500)), silent=TRUE)
      if(!inherits(fit, "try-error")) break()
    }
    if(!inherits(fit, "try-error")) break()
  }
  if(inherits(fit, "try-error")) return(list(x=x, pred=NULL, fit=fit))
  tmp <- predict(fit, newdata=x)*maxy/x$day
  nNAs <- nrow(x)-length(tmp)
  # subx cuts off period where positive < posmin
  x$fitted <-c(rep(NA, nNAs), tmp)

  pred <- data.frame(
    day=max(x$day)+1:maxh,
    date=max(x$date)+1:maxh)
  pred$x <- log(pred$day)
  pred$pred <- predict(fit, newdata=pred)*maxy/pred$day

  cm1=NA; cm2=NA
  confval1 <- try(confint(fit, level=0.95), silent=TRUE)
  if(inherits(confval1, "try-error") || any(is.na(confval1)) ){ 
    confval <- try(getcis(fit.best=fit, subx=subx, level=0.05, sig=sig))
    if(!inherits(confval1, "try-error") && !inherits(confval, "try-error") && !any(any(is.na(confval1[,1])))) confval[,1] <- confval1[,1]
    if(!inherits(confval1, "try-error") && !inherits(confval, "try-error") && !any(any(is.na(confval1[,2])))) confval[,2] <- confval1[,2]
  }else{ confval <- confval1 }

  if(!inherits(confval, "try-error")){
    if(fun=="f2"){
      cm1 <- sum(f2(log(pred$day), confval[,1])*maxy/pred$day)
      cm2 <- sum(f2(log(pred$day), confval[,2])*maxy/pred$day)
      pred$pred.low <- f2(log(pred$day), confval[,1])*maxy/pred$day
      pred$pred.high <- f2(log(pred$day), confval[,2])*maxy/pred$day
      x$fitted.low <- f2(x$x, confval[,1])*maxy/x$day
      x$fitted.high <- f2(x$x, confval[,2])*maxy/x$day
    }else{
      cm1 <- sum(f4(log(pred$day), confval[,1], s.fix)*maxy/pred$day)
      cm2 <- sum(f4(log(pred$day), confval[,2], s.fix)*maxy/pred$day)
      pred$pred.low <- f4(log(pred$day), confval[,1], s.fix)*maxy/pred$day
      pred$pred.high <- f4(log(pred$day), confval[,2], s.fix)*maxy/pred$day
      x$fitted.low <- f4(x$x, confval[,1], s.fix)*maxy/x$day
      x$fitted.high <- f4(x$x, confval[,2], s.fix)*maxy/x$day
    }

  }

  return(list(x=x, subx=subx, pred=pred, fit=fit, confval=confval, cm=sort(c(cm1, cm2))))
}

myfun <- function(data, reg, regname="", posmin=100, maxh=200, mindate="2020-02-15", maxdate="2020-12-01", bad.reg=NULL, fun="f2", s.fix=0.08, sig=0.05, fitit=FALSE, fil=7, sub=4){
  mindate <- as.Date(mindate)
  maxdate <- as.Date(maxdate)
  library(dplyr)
  b <- data %>% 
      subset(region %in% reg) %>%
      dplyr::group_by(date) %>%
      dplyr::summarize_if(is.numeric, function(x){ifelse(all(is.na(x)), NA, sum(x, na.rm=TRUE))})
    # This will not have the region column, so we add that back on
  if(missing(regname)) regname <- paste(reg, collapse="+")
  b$region <- regname
  b <- b[!is.na(b$positive),]

  #b<-subset(data, region==reg)
  if(max(b$positive, na.rm=TRUE)<500) return()
  day1 <- min(which(b$positive>posmin))

  b$new.cases <- c(NA, diff(b$positive))
  b$new.deaths <- c(NA, diff(b$death))
  b$new.cases[b$new.cases<0] <- NA
  #if(identical(reg, "Hubei China")) b$new.cases[b$new.cases>10000]=NA #hack 
  #if(identical(reg, "France")) b$new.cases[b$new.cases>10000]=NA #hack
  #if(identical(reg, "Hungary")) b$new.cases[b$new.cases>200]=NA #hack
  cfr <- mean(b$death[(nrow(b)-5):nrow(b)]/b$positive[(nrow(b)-5-7):(nrow(b)-7)], na.rm=TRUE)

  fit <- getfit(b, posmin=posmin, maxh=maxh,
                fun=fun, s.fix=s.fix, sig=sig, fitit=fitit,
                fil=fil, sub=sub)
  if(!fitit || inherits(fit$fit, "try-error") || all(is.na(fit$confval))){
    b <- fit$x
    bm <- 1.05*max(b$new.cases, na.rm=TRUE)
    popmill <- round(sum(popdata$population[match(unique(c(reg, regname)), popdata$name)], na.rm=TRUE)/1e6, digits = 2)
    cfr <- mean(b$death[(nrow(b)-5):nrow(b)]/b$positive[(nrow(b)-5-7):(nrow(b)-7)], na.rm=TRUE)
    b2 <- subset(b, months(b$date) == "April")
    cfr.apr <- sum(b2$new.deaths[7:nrow(b2)],na.rm=TRUE)/sum(b2$new.cases[1:(nrow(b2)-7)], na.rm=TRUE)
    b2 <- subset(b, months(b$date) == "September")
    cfr.sep <- sum(b2$new.deaths[7:nrow(b2)],na.rm=TRUE)/sum(b2$new.cases[1:(nrow(b2)-7)], na.rm=TRUE)
    p <- ggplot(b, aes(x = date, y = new.cases.smooth)) + geom_point() +
      ggtitle(paste0(regname, "\n", "CFR Apr = ",
                    round(100*cfr.apr,digits=2), "  CFR Sep = ",
                    round(100*cfr.sep,digits=2))) +
      scale_x_date(limits = c(mindate, maxdate), date_breaks="1 month", 
                   date_labels="%b")
    #y=bm
    lab <- paste(
      "\n\nDeaths ", max(b$death, na.rm=TRUE), " ", 
      round(max(b$death, na.rm=TRUE)/(popmill*1000), digits=2), "per thousand\n",
      "population =", popmill, "million")
    p <- p + 
      annotate("text", x=mindate+1, y=Inf, label=lab, hjust=0)
    return(list(p=p, cfr=cfr))
  }

  # updated with fitted
  b <- fit$x
  pred <- fit$pred
  bm <- 1.05*max(b$new.cases.smooth, pred$pred, pred$pred.high, na.rm=TRUE)

  p <- ggplot(b, aes(x = date, y = new.cases.smooth)) + geom_point(col="grey") + 
    geom_point(data=fit$subx, col="blue") +
    geom_line(aes(x=date, y=fitted), data=b) +
    geom_line(aes(x=date, y=pred), data=pred, color="red") +
    ggtitle(regname) +
    scale_x_date(limits = c(mindate, maxdate), date_breaks="1 month", 
                 date_labels="%b")

  npos <- max(b$positive, na.rm=TRUE)+sum(pred$pred, na.rm=TRUE)
  ndeaths.p <- max(b$death, na.rm=TRUE)+
    sum(b$new.cases[(nrow(b)-7):nrow(b)], na.rm=TRUE)*cfr
  ndeaths <- ndeaths.p +
    sum(pred$pred, na.rm=TRUE)*cfr

  if(!inherits(fit$confval, "try-error") && !reg%in%bad.reg && !any(is.na(fit$confval))){
    #mode of log normal is 
    mode.lims <- sort(round(exp(fit$confval[1,]-fit$confval[2,]^2)))
    df <- data.frame(x=b$date[1]+day1+mode.lims-1, y=c(bm,bm))
    #p <- p + geom_line(data=df, aes(x=x, y=y), size=5, color="grey")
    if(fun=="f2") cfs <- coef(fit$fit)
    if(fun=="f4") cfs <- c(coef(fit$fit)[1], s.fix, coef(fit$fit)[2])
    popmill <- round(sum(popdata$population[match(unique(c(reg, regname)), popdata$name)], na.rm=TRUE)/1e6)
    p <- p +
      ggtitle(paste0(regname, "\nEst at Sept 1: Positives = ",
                    round(npos),
                    " Deaths = ", round(ndeaths), " (", 
                    round(ndeaths/(popmill*1000), digits=2), "/thousand)\n",
                    "Estimate range: Deaths low ", 
                    round(ndeaths.p+fit$cm[1]*cfr), 
                    " to Deaths high ", 
                    round(ndeaths.p+fit$cm[2]*cfr), "\n",
                    "CFR Est = ",
                    round(100*cfr,digits=2)))
    fit$pred$new.cases.smooth <- fit$pred$pred
    p <- p +
      geom_ribbon(data=fit$pred, aes(x=date, ymin = pred.low, ymax = pred.high), fill = "pink") +
      geom_line(data=fit$pred, aes(x=date, y=pred.low), linetype="dashed", color="red") +
      geom_line(data=fit$pred, aes(x=date, y=pred.high), linetype="dashed", color="red") + 
      geom_line(data=fit$pred, aes(x=date, y=pred), color="red")
    p <- p + annotate("text", x=mindate+1, y=Inf, label=paste("\nDeaths to date\n", max(b$death, na.rm=TRUE)," ", round(max(b$death, na.rm=TRUE)/(popmill*1000),digits=2), "per thousand"), hjust=0, vjust=1)


  }else{
    cfs <- coef(fit$fit)

    p <- p +
      ggtitle(paste(regname, "\nPoint estimates: Positives =", round(npos),
                    "Deaths =", round(ndeaths), "CFR =",
                    round(100*cfr,digits=2), "\n",
                    "s = ", round(cfs[2],3), "sig = ", round(summary(fit$fit)$sigma,3)))
    p <- p + annotate("text", x=mindate+1, y=bm, label=paste("\nDeaths to date\n", max(b$death, na.rm=TRUE)), hjust=0)

  }

  return(list(p=p, fit=fit$fit, confint=fit$confval, data=b, x=fit$x, subx=fit$subx, cfr=cfr))
}

worldplot <- function(reg, posmin=100, maxh=150, fitit=FALSE, ymax=1000){
#popmill <- round(sum(worldpop$PopTotal[match(reg, worldpop$Location)], na.rm=TRUE)/1e6, digits=2)
popmill <- round(sum(popdata$population[match(reg, popdata$name)], na.rm=TRUE)/1e6, digits=2)
p <- myfun(world, reg, posmin=posmin, maxh=maxh, fitit=fitit)
if(inherits(p, "list")){
  p$p <- p$p+ scale_y_continuous(sec.axis = sec_axis(~ ./popmill, name="new cases/million"), limits=c(0,ymax*popmill))
 p$p
}
}

stateplot <- function(reg, posmin=100, maxh=150, fitit=FALSE, data="states", ylims=c(0,1000)){
#regfull <- state.name[match(reg, state.abb)]
if(data=="states"){
  popmill <- round(sum(popdata$population[match(reg, popdata$name)], na.rm=TRUE)/1e6, digits=2)
  p <- myfun(states, reg, posmin=posmin, maxh=maxh, fitit=fitit)
}
if(data=="world"){
reg2 <- paste(c(state.name, "District of Columbia")[match(reg, c(state.abb, "DC"))], "US")
p <- myfun(world, reg2, regname=paste(reg, collapse="+"), posmin=posmin, maxh=maxh)
}
if(inherits(p, "list")){
  p$p <- p$p+geom_hline(yintercept = 750*popmill, col="red"); 
  p$p <- p$p+geom_hline(yintercept = 500*popmill, col="blue"); 
  p$p <- p$p + scale_y_continuous(sec.axis = sec_axis(~ ./popmill, name="new cases/million"), limits=popmill*ylims)
 # plot(p$p)
  p$p
}
}