inst/analysis-code/inc-per-mcmc.R

## incubation period MCMC for posterior distribution data
## Nicholas Reich
## January 2015

##' Runs MCMC for incubation period calculation on posterior data
##' 
##' Runs MCMC for a set of data (dat) that is in the form of samples from the posterior distribution.
##' Currently assumes that paramters require a exp() transformation.
##' 
##' @param dat data with columns as draws from the posterior, rows as observations
##' @param dist distribution, one of G, L, W
##' @param nsamp number of samples to use in the MCMC
##' @param burnin number of burnin samples to remove at end
##' @param nthin factor by which to thin by
##' @param theta0 a vector with starting values of the paramters (par1, par2), on proposal scale (i.e. log)
##' @param proposal_sd sd for the Gaussian proposal distribution
##' @param verbose whether to print results
##' @param which_sample if 0, the MCMC will sample from the posterior on each iteration; if an integer, the column number of the data to be used
inc_per_mcmc_on_posterior <- function(dat, dist, nsamp, burnin, nthin, theta0, proposal_sd, verbose=FALSE, which_sample=0) {
    if(!(which_sample%%1==0) | which_sample<0)
        stop("which_sample must be non-negative whole number")
    ## storage
    cnames <- c("par1", "par2", "accept")
    chain <- matrix(NA, nrow=nsamp, ncol=length(cnames))
    colnames(chain) <- cnames
    chain[1,1:2] <- theta0
    chain[1,"accept"] <- 0
    
    ## define parameter transformation function
    ## for dist = G or W exp of both 
    ## for dist = L, exp of just sdlog
    if(dist=="L"){
        par_trans <- function(x) c(x[1], exp(x[2]))
    } else {
        par_trans <- function(x) exp(x)
    }
                        
    for(i in 2:nsamp){
        current_pars <- chain[i-1,1:2]
        if(which_sample==0) {
            t <- dat[,sample(ncol(dat), 1)]
        } else {
            t <- dat[,which_sample]
        }
        proposal_pars <- current_pars + rnorm(2, mean=0, sd=proposal_sd)
        current_lik <- siclik(par_trans(current_pars), t, dist=dist)
        proposal_lik <- siclik(par_trans(proposal_pars), t, dist=dist)
        a <- min(proposal_lik/current_lik, 1)
        chain[i,"accept"] <- rbinom(1, 1, a)
        chain[i,1:2] <- chain[i, "accept"]*proposal_pars + (1-chain[i, "accept"])*current_pars
    }
    if(verbose) {
        layout(matrix(1:3, nrow=3))
        plot(chain[,1], type="l")
        plot(chain[,2], type="l")
        plot(chain[,1], chain[,2])
        print(paste("acceptance rate =", round(mean(chain[,"accept"]), 2)))
        #print(apply(chain[(burnin+1):nrow(chain),], MARGIN=2, FUN=mean))
    }
    idx <- seq(burnin+1, nrow(chain), by=nthin)
    chain <- chain[idx,]
    return(chain)
}

##' Runs MCMC for incubation period calculation
##' 
##' Runs MCMC for a set of data (dat) that is in the form of single interval censored data.
##' 
##' @param dat data with columns for min and max incubation periods rows as observations
##' @param ip_cols character vector of length 2 with names of the columns containing the minimum and maximum possible incubation periods per observation
##' @param dist distribution, one of G, L, W
##' @param nsamp number of samples to use in the MCMC
##' @param burnin number of burnin samples to remove at end
##' @param nthin factor by which to thin by
##' @param theta0 a vector with starting values of the paramters (par1, par2), on proposal scale (i.e. log)
##' @param proposal_sd sd for the Gaussian proposal distribution
##' @param verbose whether to print results
##' @param log_scale logical specifying whether to return likelihood on log-scale
inc_per_mcmc<- function(dat, ip_cols, dist, nsamp, burnin, nthin, theta0, 
                        proposal_sd, verbose=FALSE, log_scale=FALSE) {
    ## ensure that ip_cols values are in the column names of dat
    if(!(ip_cols[1] %in% colnames(dat)) | !(ip_cols[2] %in% colnames(dat)) )
        stop("ip_cols must be in colnames(dat)")
    
    ## storage
    cnames <- c("par1", "par2", "accept")
    chain <- matrix(NA, nrow=nsamp, ncol=length(cnames))
    colnames(chain) <- cnames
    chain[1,1:2] <- theta0
    chain[1,"accept"] <- 0
    
    ## define parameter transformation function
    ## for dist = G or W exp of both 
    ## for dist = L, exp of just sdlog
    if(dist=="L"){
        par_trans <- function(x) c(x[1], exp(x[2]))
    } else {
        par_trans <- function(x) exp(x)
    }
    
    for(i in 2:nsamp){
        current_pars <- chain[i-1,1:2]
        tmin <- dat[,ip_cols[1]]
        tmax <- dat[,ip_cols[2]]
        proposal_pars <- current_pars + rnorm(2, mean=0, sd=proposal_sd)
        current_lik <- siclik_two_val(par_trans(current_pars), 
                                      tmin=tmin, tmax=tmax, dist=dist,
                                      log_scale=log_scale)
        proposal_lik <- siclik_two_val(par_trans(proposal_pars), 
                                       tmin=tmin, tmax=tmax, dist=dist,
                                       log_scale=log_scale)
        a <- ifelse(log_scale,
                    min(exp(proposal_lik-current_lik), 1),
                    min(proposal_lik/current_lik, 1))
        chain[i,"accept"] <- rbinom(1, 1, a)
        chain[i,1:2] <- chain[i, "accept"]*proposal_pars + (1-chain[i, "accept"])*current_pars
    }
    if(verbose) {
        layout(matrix(1:3, nrow=3))
        plot(chain[,1], type="l")
        plot(chain[,2], type="l")
        plot(chain[,1], chain[,2])
        print(paste("acceptance rate =", round(mean(chain[,"accept"]), 2)))
        #print(apply(chain[(burnin+1):nrow(chain),], MARGIN=2, FUN=mean))
    }
    idx <- seq(burnin+1, nrow(chain), by=nthin)
    chain <- chain[idx,]
    return(chain)
}

##' calculates likelihood for single interval-censored data
##' 
##' We use a consistent (par1, par2) notation for each distribution, they
##' map in the following manner: \deqn{Log-normal(meanlog=par1, sdlog=par2)}
##' \deqn{Gamma(shape=par1, scale=par2)} \deqn{Weibull(shape=par1, scale=par2)}
##' 
##' @param pars a vector with pararmeter 1 and parameter 2 of distribution (see details)
##' @param t assumed SIC observation(s) of the incubation period (t, t+1)
##' @param dist single charachter specifying distribution to use: 
##'     W=weibull, G=gamma, L=log-normal
##' 
siclik <- function(pars, t, dist){
    par1 <- pars[1]
    par2 <- pars[2]
    ## calculates the SIC likelihood as the difference in CDFs
    if (dist =="W"){
        prod(pweibull(t+1, shape=par1, scale=par2) - pweibull(t, shape=par1, scale=par2))
    } else if (dist =="G") {
        prod(pgamma(t+1, shape=par1, scale=par2) - pgamma(t, shape=par1, scale=par2))
    } else if (dist == "L"){
        prod(plnorm(t+1, meanlog=par1, sdlog=par2) - plnorm(t, meanlog=par1, sdlog=par2))
    } else {
        stop("distribution not supported")
    }
}

##' calculates likelihood for single interval-censored data
##' 
##' We use a consistent (par1, par2) notation for each distribution, they
##' map in the following manner: \deqn{Log-normal(meanlog=par1, sdlog=par2)}
##' \deqn{Gamma(shape=par1, scale=par2)} \deqn{Weibull(shape=par1, scale=par2)}
##' 
##' @param pars a vector with pararmeter 1 and parameter 2 of distribution (see details)
##' @param tmin minimum value(s) of the incubation period
##' @param tmax maximum value(s) of the incubation period
##' @param dist single charachter specifying distribution to use: 
##'     W=weibull, G=gamma, L=log-normal
##' @param log_scale logical specifying whether to return likelihood on log-scale
##' 
siclik_two_val <- function(pars, tmin, tmax, dist, log_scale=FALSE){
    par1 <- pars[1]
    par2 <- pars[2]
    ## calculates the SIC likelihood as the difference in CDFs
    if (dist =="W"){
        liks <- pweibull(tmax, shape=par1, scale=par2) - pweibull(tmin, shape=par1, scale=par2)
    } else if (dist =="G") {
        liks <- pgamma(tmax, shape=par1, scale=par2) - pgamma(tmin, shape=par1, scale=par2)
    } else if (dist == "L"){
        liks <- plnorm(tmax, meanlog=par1, sdlog=par2) - plnorm(tmin, meanlog=par1, sdlog=par2)
    } else {
        stop("distribution not supported")
    }
    
    ## calculate lik on log scale or not
    if(log_scale) {
        lik <- sum(log(liks))
    } else {
        lik <- prod(liks)
    }
    return(lik)
}


##' calculate Rhat for each parameter
##' 
##' @param chain the output from inc_per_mcmc
get_Rhat <- function(chains) {
    n <- nrow(chains[chains$chain==1,])
    m <- max(chains$chain)
    ## for shape
    mean_shape <- mean(chains$shape)
    wcv_shp <- mean(tapply(chains$shape, INDEX = list(chain=chains$chain), FUN=var))
    bcv_shp <- n / (m-1) * sum(tapply(chains$shape, INDEX = list(chain=chains$chain), 
                                  FUN = function(x) (mean(x)-mean_shape)^2))
    var_shp <- (1-1/n)*wcv_shp + 1/n*bcv_shp
    Rhat_shp <- sqrt(var_shp/wcv_shp)
    ## for scale
    mean_scl <- mean(chains$scale)
    wcv_scl <- mean(tapply(chains$scale, INDEX = list(chain=chains$chain), FUN=var))
    bcv_scl <- n / (m-1) * sum(tapply(chains$scale, INDEX = list(chain=chains$chain), 
                                      FUN = function(x) (mean(x)-mean_scl)^2))
    var_scl <- (1-1/n)*wcv_scl + 1/n*bcv_scl
    Rhat_scl <- sqrt(var_scl/wcv_scl)
    return(c(Rhat_scl=Rhat_scl, Rhat_shp=Rhat_shp))
}


##' make posterior likelihood plot
##' 
##' @param chain the output from inc_per_mcmc
##' @param dist the distribution assumed for the model output.
plot_posterior <- function(chain, dist) {
    require(coda)
    require(ggplot2)
    chain <- data.frame(chain)
    if(dist == "L") {
        chain[,1:2] <- cbind(exp(chain[,1]), exp(exp(chain[,2])))
        xlab_text <- "median"
        ylab_text <- "dispersion"
    } else {
        chain[,1:2] <- exp(chain[,1:2])
        xlab_text <- "shape"
        ylab_text <- "scale"
    }
    par1_median <- median(chain$par1)
    par2_median <- median(chain$par2)
    
    ## get contour line data
    nbin=100
    p1 <- seq(min(chain$par1), max(chain$par1), length.out=nbin)
    p2 <- seq(min(chain$par2), max(chain$par2), length.out=nbin)
    ptiles <- expand.grid(p1, p2)
    colnames(ptiles) <- c("p1", "p2")
    ptiles$p95 <- switch(dist,
                         W=qweibull(.95, shape=ptiles$p1, scale=ptiles$p2),
                         G=qgamma(.95, shape=ptiles$p1, scale=ptiles$p2),
                         L=qlnorm(.95, meanlog=log(ptiles$p1), log(sdlog=ptiles$p2)))
    
    p <- ggplot(chain, aes(x = par1, y = par2)) +
        #stat_density2d(geom="tile", aes(fill = ..density..), contour = FALSE) +
        geom_point(alpha=I(.01)) +
        stat_ellipse(color="red", level=0.90)+
        stat_ellipse(color="red", level=0.95)+
        stat_contour(data=ptiles, aes(x=p1, y=p2, z=p95), breaks=c(21), color="blue") +
        annotate("point", x=par1_median, y=par2_median, shape=3, color="red") +
        theme_bw() + xlab(xlab_text) + ylab(ylab_text)
    print(p)
}

##' make posterior likelihood plot for gamma distribution
##' 
##' @param mcmc_df the rbound output from inc_per_mcmc
plot_gamma_posterior <- function(mcmc_df, H, p95.breaks=c(10, 15, 21)) {
    require(ggplot2)
    require(ks)
    require(reshape2)
    require(dplyr)
    shape_median <- median(mcmc_df$shape)
    scale_median <- median(mcmc_df$scale)
    
    ## get contour line data for p95 lines
    nbin=100
    p1 <- seq(min(mcmc_df$shape), max(mcmc_df$shape), length.out=nbin)
    p2 <- seq(min(mcmc_df$scale), max(mcmc_df$scale), length.out=nbin)
    ptiles <- expand.grid(p1, p2)
    colnames(ptiles) <- c("p1", "p2")
    ptiles$p95 <- qgamma(.95, shape=ptiles$p1, scale=ptiles$p2)
    
    ## get contour line data for KDE percentiles
    if(nrow(mcmc_df)>10000) {
        idx <- sample(1:nrow(mcmc_df), size=10000, replace=FALSE)
    } else {
        idx <- sample(1:nrow(mcmc_df), size=nrow(mcmc_df), replace=FALSE)
    }
    x <- mcmc_df[idx,1:2]
    fhat <- kde(x, H=H, compute.cont=TRUE)
    dimnames(fhat[['estimate']]) <- list(fhat[["eval.points"]][[1]], 
                                         fhat[["eval.points"]][[2]])
    contour_df <- melt(fhat[['estimate']])
    
    if(nrow(mcmc_df)>5000) {
        mcmc_df <- sample_n(mcmc_df, size=5000)
    }
    ## make plot
    p <- ggplot(mcmc_df, aes(x = shape, y = scale)) +
        geom_point(alpha=I(.05)) +
        stat_contour(data=ptiles, aes(x=p1, y=p2, z=p95, colour=..level..), breaks=p95.breaks) +
        annotate("point", x=shape_median, y=scale_median, shape=3, color="red") +
        geom_contour(aes(x=Var1, y=Var2, z=value), data=contour_df, 
                     breaks=fhat[["cont"]]["5%"], color="darkred") + 
        geom_contour(aes(x=Var1, y=Var2, z=value), data=contour_df, 
                     breaks=fhat[["cont"]]["10%"], color="red") +
        scale_x_log10() + scale_y_log10() +
        theme_bw() + xlab("shape") + ylab("scale")
    print(p)
    return(p)
}

##' make posterior likelihood plot for gamma distribution
##' 
##' @param mcmc_dfs the rbound outputs from inc_per_mcmc
##' @param Hs bandwidths matrices from inc_per_mcmc
##' @param p95.breaks contours to show of 95th percentiles
##' @params n_points max number of points to include in the KDE
plot_credible_regions <- function(mcmc_dfs, Hs, p95.breaks=c(10, 15, 21), n_points=10000) {
    require(ggplot2)
    require(ks)
    require(reshape2)
    require(dplyr)
    #shape_median <- median(mcmc_df$shape)
    #scale_median <- median(mcmc_df$scale)
    
    ## set plot range
    shape_range <- c(2.3, 47)
    scale_range <- c(.27, 4.5)
    
    ## colors
    colors <- c("#1b9e77", "#d95f02", "#7570b3")
    
    ## get contour line data for p95 lines
    nbin=100
    p1 <- seq(shape_range[1], shape_range[2], length.out=nbin)
    p2 <- seq(scale_range[1], scale_range[2], length.out=nbin)
    ptiles <- expand.grid(p1, p2)
    colnames(ptiles) <- c("p1", "p2")
    ptiles$p95 <- qgamma(.95, shape=ptiles$p1, scale=ptiles$p2)
    
    ## get contour line data for KDE percentiles
    contour_dfs <- vector("list", length(mcmc_dfs))
    contour_levels <- rep(0, length(mcmc_dfs))
    max_size <- n_points
    for(i in 1:length(mcmc_dfs)){
        if(nrow(mcmc_dfs[[i]])>max_size) {
            idx <- sample(1:nrow(mcmc_dfs[[i]]), size=max_size, replace=FALSE)
        } else {
            idx <- sample(1:nrow(mcmc_dfs[[i]]), size=nrow(mcmc_dfs[[i]]), replace=FALSE)
        }
        x <- mcmc_dfs[[i]][idx,1:2]
        fhat <- kde(x, H=Hs[[i]], compute.cont=TRUE)
        dimnames(fhat[['estimate']]) <- list(fhat[["eval.points"]][[1]], 
                                             fhat[["eval.points"]][[2]])
        contour_dfs[[i]] <- melt(fhat[['estimate']])
        contour_levels[i] <- fhat[["cont"]]["5%"]
    }
    
    ## make plot
    p <- ggplot(ptiles) +
        stat_contour(aes(x=p1, y=p2, z=p95), breaks=p95.breaks) +
        #annotate("point", x=shape_median, y=scale_median, shape=3, color="red") 
        scale_x_log10(breaks=c(2:5, 20, 40)) + scale_y_log10(breaks=c(.2, .5, 1:4)) +
        theme_bw() + xlab("shape") + ylab("scale")
    for(i in 1:length(mcmc_dfs)) {
        p <- p + stat_contour(aes(x=Var1, y=Var2, z=value), data=contour_dfs[[i]], 
                              breaks=contour_levels[i], geom="polygon", 
                              fill=colors[i], alpha=.6) 
    }
    print(p)
    return(p)
}


##' make posterior likelihood plot for gamma distribution
##' 
##' @param mcmc_dfs the rbound outputs from inc_per_mcmc
##' @param kdes list of results from call to fit_kde
##' @param label_txt labels for points
##' @param colors colors for each region
##' @param show.legend logical, passed to geom_point
##' @param base.size size for font, to be passed to theme_bw()
plot_modified_credible_regions <- function(mcmc_dfs, kdes, label_txt, colors, show.legend=FALSE, base.size=12) {
    require(ggplot2)
    require(reshape2)
    require(dplyr)
    
    ## set credible region data for KDE percentiles
    pstr_median <- data.frame(median=rep(NA, length(mcmc_dfs)), 
                              p95=rep(NA, length(mcmc_dfs)),
                              disease = factor(label_txt, 
                                               levels=label_txt, ordered=TRUE))
    
    ## make plot
    p <- ggplot() +  theme_bw() + xlab("median incubation period (days)") + ylab("95th percentile of incubation period (days)")
    for(i in 1:length(mcmc_dfs)) {
        pstr_median[i,"median"] <- median(mcmc_dfs[[i]]$median)
        pstr_median[i,"p95"] <- median(mcmc_dfs[[i]]$p95)
        p <- p + stat_contour(aes(x=Var1, y=Var2, z=value), 
                              data=kdes[[i]][["contour_df"]], 
                              breaks=kdes[[i]][["contour_level"]], 
                              geom="polygon", 
                              fill=colors[i], alpha=.6) 
    }
    p <- p + geom_point(data=pstr_median, aes(x=median, y=p95, color=disease),
                        show.legend = show.legend) +
        theme_bw(base_size = base.size) + 
        theme(legend.title=element_blank(),
              legend.position=c(1,1), legend.justification=c(1,1)) +
        scale_color_manual(values=colors) 
    print(p)
    return(p)
}

## fit kde for use in smoothing
fit_kde <- function(mcmc_df, H, max_size) {
    require(reshape2)
    if(nrow(mcmc_df)>max_size) {
        idx <- sample(1:nrow(mcmc_df), size=max_size, replace=FALSE)
    } else {
        idx <- sample(1:nrow(mcmc_df), size=nrow(mcmc_df), replace=FALSE)
    }
    ## calculate median and p95
    x <- mcmc_df[idx, c("median", "p95")]
    fhat <- kde(x, H=H, compute.cont=TRUE)
    dimnames(fhat[['estimate']]) <- list(fhat[["eval.points"]][[1]], 
                                         fhat[["eval.points"]][[2]])
    contour_df <- melt(fhat[['estimate']])
    contour_level <- fhat[["cont"]]["5%"]
    return(list(contour_df=contour_df, contour_level=contour_level))
}
    
    

## plot densities
plot_gamma_densities <- function(mcmc_df, kde, n_to_plot, to=30, 
                                 plotcolor, plotcolor_light, label_txt, xaxs_lab=TRUE) {
    median_shape <- median(mcmc_df$shape)
    median_scale <- median(mcmc_df$scale)
    
    ## idx
    contour_df <- kde[["contour_df"]]
    credible_region_idx <- which(contour_df$value>kde[["contour_level"]])
    marginal_var1_ci <- range(contour_df$Var1[credible_region_idx])
    marginal_var2_ci <- range(contour_df$Var2[credible_region_idx])
    
    ## plot the first one
    xgam <- seq(0, to, by=.1)
    gam_dat <- data_frame(xgam, 
                          Fgam=dgamma(xgam, 
                                      shape=median_shape,
                                      scale=median_scale))
    p <- ggplot() + geom_line(data=gam_dat, aes(x=xgam, y=Fgam), color=plotcolor) +
        annotate("rect", 
                 xmin=marginal_var1_ci[1], xmax=marginal_var1_ci[2], 
                 ymin=-10, ymax=10,
                 alpha=.2) +
        annotate("rect", 
                 xmin=marginal_var2_ci[1], xmax=marginal_var2_ci[2], 
                 ymin=-10, ymax=10,
                 alpha=.2) 
    
    ## add samples
    for(i in 1:n_to_plot){
        shp <- as.numeric(mcmc_df[i,"shape"])
        scl <- as.numeric(mcmc_df[i,"scale"])
        gam_dat <- data_frame(xgam, 
                              Fgam=dgamma(xgam, shape=shp,
                                          scale=scl))
        p <- p + geom_line(data=gam_dat, aes(x=xgam, y=Fgam), 
                           color=plotcolor_light, alpha=I(100/n_to_plot))
    }
    # reimprint median
    gam_dat <- data_frame(xgam, 
                          Fgam=dgamma(xgam, shape=median_shape,
                                      scale=median_scale))
    p <- p + geom_line(data=gam_dat, aes(x=xgam, y=Fgam), color=plotcolor, size=1) +
        geom_text(aes(x=to-5, y=.2, label=label_txt)) +
        ylab("density") + coord_cartesian(ylim=c(0, .225))
    if(xaxs_lab) {
        p <- p + xlab("duration of incubation period (days)")
    } else {
        p <- p + xlab(NULL)
    }
    print(p)
    return(p)
}

##' plot the distribution with MLE parameters
##' 
##' @param chain the output from inc_per_mcmc
##' @param dist the distribution assumed for the model output.
plot_ml_dist <- function(chain, dist, from=0, to=25) {
    par1_median <- exp(median(chain[,"par1"]))
    par2_median <- exp(median(chain[,"par2"]))
    x <- seq(from, to, length.out=1000)
    y <- switch(dist,
                W=dweibull(x, shape=par1_median, scale=par2_median),
                G=dgamma(x, shape=par1_median, scale=par2_median),
                L=dlnorm(x, meanlog=log(par1_median), sdlog=par2_median))
    par(mar=c(2,5,1,1))
    plot(x, y, las=1, type="l", bty="n",
         ylab="density", xlab="incubation period (in days)")
}


## from http://stackoverflow.com/questions/23437000/how-to-plot-a-contour-line-showing-where-95-of-values-fall-within-in-r-and-in
## not used anymore, superceded by stat_ellipse
getLevel <- function(x,y,prob=0.95) {
    kk <- MASS::kde2d(x,y)
    dx <- diff(kk$x[1:2])
    dy <- diff(kk$y[1:2])
    sz <- sort(kk$z)
    c1 <- cumsum(sz) * dx * dy
    approx(c1, sz, xout = 1 - prob)$y
}


#' Calculate robust bandwidths for plotting kernel density contours
#'
#' @param params output from inc_per_mcmc
#'
#' @return a matrix of bandwidths
get_robust_bandwidths <- function(params, cols=c("shape", "scale")) {
    require(ks)
    x <- params[, cols]
    Hscv(x)
}
reichlab/activemonitr documentation built on April 9, 2024, 2:17 p.m.