baker: Bayesian Analysis Kit for Etiology Research

Documented in plot_etiology_regression plot_etiology_strat

#' visualize the etiology regression with a continuous covariate
#' 
#' This function visualizes the etiology regression against one continuous covariates, e.g., 
#' enrollment date. (NB: dealing with NoA, multiple-pathogen causes, other continuous covariates?)
#' 
#' @param DIR_NPLCM File path to the folder containing posterior samples
#' @param stratum_bool a vector of TRUE/FALSE with TRUE indicating the rows of subjects to include
#' @param plot_basis TRUE for plotting basis functions; Default to FALSE
#' @param truth a list of truths computed from true parameters in simulations; elements: Eti, FPR, PR
#'  which are matrices of # of rows = # of subjects, # columns: \code{length(cause_list)} for Eti
#'  and \code{ncol(data_nplcm$Mobs$MBS$MBS1)}; Default to \code{NULL} for real data analyses.
#'
#' @return A figure of etiology regression curves and some marginal positive rate assessment of
#' model fit; See example for the legends.
#' 
#' @import graphics
#' @examples 
#' \dontrun{
#' # legend.text = c("[UPPER FIGURES]",
#'                 "observed prevalence: cases",
#'                 "observed prevalence: controls",
#'                 "fitted prevalence: cases",
#'                 "fitted prevalence: controls",
#'                 "true positive rate: mean",
#'                 "true positive rate: 95%CI",
#'                 "[BOTTOM FIGURES]",
#'                 "etiology curve: mean",
#'                 "overall etiology: mean",
#'                 "overall etiology: 95%CI","","","")
#' legend.col=c("white","black","dodgerblue2","black","dodgerblue2","red","red",
#'              "white","springgreen4","orange","orange","white","white","white")
#' legend.lty=c(1,2,2,1,1,1,2,1,1,1,2)
#' legend.lwd=c(2,2,2,2,2,2,2,2,2,2,2,2,2,2)
#' legend("topleft",legend=legend.text,
#'        lty=legend.lty,lwd=legend.lwd,
#'        col=legend.col,ncol=2,
#'        y.intersp=1.5,cex=1.6,box.col=NA)
#'        }
#' @references See example figures:
#' \url{https://github.com/zhenkewu/baker/blob/master/inst/figs/visualize_etiology_regression_SITE=1.pdf}
#' and its legends: 
#' \url{https://github.com/zhenkewu/baker/blob/master/inst/figs/legends_visualize_etiology_regression.png}
#' @family visualization functions
#' @export
plot_etiology_regression <- function(DIR_NPLCM,stratum_bool,plot_basis=FALSE,truth=NULL){
  old_par <- graphics::par(graphics::par("mfrow", "mar"))
  on.exit(graphics::par(old_par))
  if (!is_jags_folder(DIR_NPLCM)){
    stop("==[baker] oops, not a folder baker recognizes. 
         Try a folder generated by baker, e.g., a temporary folder?==")
  }
  # JAGS:
  #
  # Read data from DIR_NPLCM:
  #
  data_nplcm <- dget(file.path(DIR_NPLCM,"data_nplcm.txt"))  
  
  new_env <- new.env()
  source(file.path(DIR_NPLCM,"jagsdata.txt"),local=new_env)
  bugs.dat <- as.list(new_env)
  rm(new_env)
  res_nplcm <- coda::read.coda(file.path(DIR_NPLCM,"CODAchain1.txt"),
                               file.path(DIR_NPLCM,"CODAindex.txt"),
                               quiet=TRUE)
  print_res <- function(x) plot(res_nplcm[,grep(x,colnames(res_nplcm))])
  get_res   <- function(x) res_nplcm[,grep(x,colnames(res_nplcm))]
  
  # structure the posterior samples:
  ncol_dm_FPR_1 <- ncol(bugs.dat$Z_FPR_1)
  JBrS_1        <- ncol(bugs.dat$MBS_1)
  n_samp_kept   <- nrow(res_nplcm)
  ncol_dm_Eti   <- ncol(bugs.dat$Z_Eti)
  Jcause        <- bugs.dat$Jcause
  Nd            <- bugs.dat$Nd
  Nu            <- bugs.dat$Nu
  
  betaFPR_samp <- array(t(get_res("betaFPR_1")),c(ncol_dm_FPR_1,JBrS_1,n_samp_kept))
  betaEti_samp <- array(t(get_res("betaEti")),c(ncol_dm_Eti,Jcause,n_samp_kept))
  thetaBS_1_samp <- get_res("thetaBS_1")
  linpred      <- function(beta,design_matrix){design_matrix%*%beta}
  
  out_FPR_linpred     <- array(apply(betaFPR_samp,3,linpred,design_matrix=bugs.dat$Z_FPR_1),
                               c(Nd+Nu,JBrS_1,n_samp_kept))
  out_Eti_linpred     <- array(apply(betaEti_samp,3,linpred,design_matrix=bugs.dat$Z_Eti),
                               c(Nd,Jcause,n_samp_kept))
  #
  # 2. use this code if date is included in etiology and false positive regressions:
  #
  # false positive rates:
  subset_FPR_ctrl     <- data_nplcm$Y==0 & stratum_bool # <--- specifies who to look at.
  plotid_FPR_ctrl     <- which(subset_FPR_ctrl)[order(data_nplcm$X$std_date[subset_FPR_ctrl])]
  curr_date_FPR       <- data_nplcm$X$std_date[plotid_FPR_ctrl]
  FPR_prob_scale      <- expit(out_FPR_linpred[plotid_FPR_ctrl,,])
  FPR_mean <- apply(FPR_prob_scale,c(1,2),mean)
  FPR_q    <- apply(FPR_prob_scale,c(1,2),quantile,c(0.025,0.975))
  
  # positive rates for cases:
  fitted_margin_case <- function(pEti_ord,theta,psi,template){
    mixture <-  pEti_ord
    tpr     <-  t(t(template)*theta)
    fpr     <- t(t(1-template)*psi)
    colSums(tpr*mixture + fpr*mixture)
  }
  
  subset_FPR_case          <- data_nplcm$Y==1 & stratum_bool # <--- specifies who to look at.
  plotid_FPR_case          <- which(subset_FPR_case)[order(data_nplcm$X$std_date[subset_FPR_case])]
  curr_date_FPR_case       <- data_nplcm$X$std_date[plotid_FPR_case]
  FPR_prob_scale_case      <- expit(out_FPR_linpred[plotid_FPR_case,,])
  
  # etiology:
  subset_Eti <- data_nplcm$Y==1 & stratum_bool # <--- specifies who to look at.
  plotid_Eti <- which(subset_Eti)[order(data_nplcm$X$std_date[subset_Eti])]
  curr_date_Eti  <- data_nplcm$X$std_date[plotid_Eti]
  
  Eti_prob_scale <- apply(out_Eti_linpred[plotid_Eti,,],c(1,3),softmax)
  Eti_mean <- apply(Eti_prob_scale,c(1,2),mean)
  Eti_q    <- apply(Eti_prob_scale,c(1,2),quantile,c(0.025,0.975))
  Eti_overall <- apply(Eti_prob_scale,c(1,3),mean)
  Eti_overall_mean <- rowMeans(Eti_overall)
  Eti_overall_q    <- apply(Eti_overall,1,quantile,c(0.025,0.975))
  
  PR_case <- array(NA,c(length(plotid_Eti),JBrS_1,n_samp_kept))
  for (i in 1:(length(plotid_Eti))){
    for (t in 1:n_samp_kept){
      PR_case[i,,t] <- fitted_margin_case(Eti_prob_scale[,i,t],
                                          thetaBS_1_samp[t,],
                                          FPR_prob_scale_case[i,,t],
                                          bugs.dat$templateBS_1[1:Jcause,]
      )
    }
  }
  
  PR_case_mean <- apply(PR_case,c(1,2),mean)
  PR_case_q <- apply(PR_case,c(1,2),quantile,c(0.025,0.975))
  
  ##################
  # plot results:
  #################
  par(mfcol=c(2,Jcause),oma=c(3,0,3,0))
  for (j in 1:Jcause){ # <--- the marginal dimension of measurements.
    # need to fix this for NoA! <------------------------ FIX!
    #
    # Figure 1 for case and control positive rates:
    #
    par(mar=c(2,5,0,1))
    {                                          #<------------------------ FIX!
      # if (j == Jcause){
      #   plot(0,0.5,type="l",ylim=c(0,1),pch="n",
      #        xaxt="n",xlab="",ylab="positive rate",las=2,bty="n")
      #   
      #   mtext("other",side = 3,cex=1.5,line=1)
      # } else{                                  #<------------------------ FIX!
      plot(curr_date_FPR,FPR_mean[,j],type="l",ylim=c(0,1),
           xaxt="n",xlab="",ylab="positive rate",las=2,bty="n")
      polygon(c(curr_date_FPR, rev(curr_date_FPR)),
              c(FPR_q[1,,j], rev(FPR_q[2,,j])),
              col = grDevices::rgb(0, 1, 1,0.5),border = NA)
      # rug plot:
      rug(curr_date_FPR[data_nplcm$Mobs$MBS$MBS1[plotid_FPR_ctrl,j]==1],side=3)
      rug(curr_date_FPR[data_nplcm$Mobs$MBS$MBS1[plotid_FPR_ctrl,j]==0],side=1)
      
      if(!is.null(truth$FPR)){lines(curr_date_FPR,truth$FPR[plotid_FPR_ctrl,j],col="blue",lwd=3)}
      
      if(plot_basis){matplot(curr_date_FPR,bugs.dat$Z_FPR_1[plotid_FPR_ctrl,],col="blue",type="l",add=TRUE)}
      
      mtext(names(data_nplcm$Mobs$MBS[[1]])[j],side = 3,cex=1.5,line=1)
      
      points(curr_date_FPR_case,PR_case_mean[,j],type="l",ylim=c(0,1))
      polygon(c(curr_date_FPR_case, rev(curr_date_FPR_case)),
              c(PR_case_q[1,,j], rev(PR_case_q[2,,j])),
              col =  grDevices::rgb(1, 0, 0,0.5),border = NA)
      if(!is.null(truth$PR)){lines(curr_date_FPR_case,truth$PR[plotid_FPR_case,j],col="black",lwd=3)}
      # rug plot:
      rug(curr_date_FPR_case[data_nplcm$Mobs$MBS$MBS1[plotid_FPR_case,j]==1],side=3,col="dodgerblue2",line=-1)
      rug(curr_date_FPR_case[data_nplcm$Mobs$MBS$MBS1[plotid_FPR_case,j]==0],side=1,col="dodgerblue2",line=-1)
      
      abline(h=colMeans(thetaBS_1_samp)[j],col="red")
      abline(h=quantile(thetaBS_1_samp[,j],0.025),col="red",lty=2)
      abline(h=quantile(thetaBS_1_samp[,j],0.975),col="red",lty=2)
      
      # add raw moving average dots:
      ma <- function(x,n=60){stats::filter(x,rep(1/n,n), sides=2)}
      
      ma_cont <- function(y,x,hw=0.35){
        res <- rep(NA,length(y))
        for (i in seq_along(y)){
          res[i] <- mean(y[which(x>=x[i]-hw & x<=x[i]+hw)])
        }
        res
      }
      response.ctrl <- bugs.dat$MBS_1[plotid_FPR_ctrl,j]
      dat_ctrl <- data.frame(std_date=data_nplcm$X$std_date[plotid_FPR_ctrl])[!is.na(response.ctrl),,drop=FALSE]
      dat_ctrl$runmean <- ma_cont(response.ctrl[!is.na(response.ctrl)],dat_ctrl$std_date[!is.na(response.ctrl)])
      points(runmean ~ std_date,data=dat_ctrl[!is.na(response.ctrl),],lty=2,pch=1,cex=0.5,type="o",col="dodgerblue2")
      
      response.case <- bugs.dat$MBS_1[plotid_FPR_case,j]
      dat_case <- data.frame(std_date=data_nplcm$X$std_date[plotid_FPR_case])[!is.na(response.case),,drop=FALSE]
      dat_case$runmean <- ma_cont(response.case[!is.na(response.case)],dat_case$std_date[!is.na(response.case)])
      points(runmean ~ std_date,data=dat_case,lty=2,pch=1,cex=0.5,type="o")
    }
    #
    # Figure 2 for Etiology Regression:
    #
    par(mar=c(2,5,0,1))
    plot(curr_date_Eti,Eti_mean[j,],type="l",ylim=c(0,1),xlab="standardized date",
         ylab="etiologic fraction",bty="n",xaxt="n",yaxt="n",las=2)
    ## ONLY FOR SIMULATIONS <---------------------- FIX!
    if(!is.null(truth$Eti)){points(curr_date_Eti,truth$Eti[plotid_Eti,j],type="l",lwd=3,col="black")}
    if(plot_basis){matplot(curr_date_Eti,bugs.dat$Z_Eti[plotid_Eti,],col="blue",type="l",add=TRUE)}
    
    # overall pie:
    abline(h=Eti_overall_mean[j],col="black",lwd=2)
    abline(h=Eti_overall_q[,j],col="black",lty=2,lwd=1.5)
    mtext("Overall Pie",side=3,line=-0.5,cex=1.2)
    mtext(paste0(round(Eti_overall_mean[j],3)*100,"%"),side=3,line=-2,cex=1.2)
    mtext(paste0(round(Eti_overall_q[1,j],3)*100,"%"),side=3,line=-2.5,cex=1,adj=0.15)
    mtext(paste0(round(Eti_overall_q[2,j],3)*100,"%"),side=3,line=-2.5,cex=1,adj=0.85)
    
    # add x-axis for dates:
    X <- data_nplcm$X
    Y <- data_nplcm$Y
    # some date transformations:
    X$date_plot  <- as.Date(X$ENRLDATE)
    X$date_month_centered <- as.Date(cut(X$date_plot,breaks="2 months"))+30
    X$date_month <- as.Date(cut(X$date_plot,breaks="2 months"))
    
    color2 <- grDevices::rgb(190, 190, 190, alpha=200, maxColorValue=255)
    color1 <- grDevices::rgb(216,191,216, alpha=200, maxColorValue=255)
    #cases:
    last_interval <- max(X$date_month)
    lubridate::month(last_interval) <- lubridate::month(last_interval) +2
    axis(1, X$std_date[c(plotid_FPR_case)], 
         format(c(X$date_month[c(plotid_FPR_case)]), "%Y %b"), 
         cex.axis = .7,las=1)
    axis(2,at = seq(0,1,by=0.2),labels=seq(0,1,by=0.2),las=2)
    
    polygon(c(curr_date_Eti, rev(curr_date_Eti)),
            c(Eti_q[1,j,], rev(Eti_q[2,j,])),
            col = grDevices::rgb(0.5,0.5,0.5,0.5),border = NA)
  }
} 


#' visualize the etiology estimates for each discrete levels
#' 
#' This function visualizes the etiology estimates against one discrete covariate, e.g., 
#' age groups. 
#' 
#' @param DIR_NPLCM File path to the folder containing posterior samples
#' @param strata_weights a vector of weights that sum to one; for each pathogen
#' the weights specify how the j-th etiology fraction should be combined across all
#' levels of the discrete predictors in the data
#' @param truth a list of true values, e.g., 
#' \code{truth=list(allEti = a list of etiology fractions)}
#' 
#' @import graphics
#' @family visualization functions
#' @export
plot_etiology_strat <- function(DIR_NPLCM,strata_weights=NULL,truth=NULL){
  old_par <- graphics::par(graphics::par("mfrow", "mar"))
  on.exit(graphics::par(old_par))
  if (!is_jags_folder(DIR_NPLCM)){
    stop("==[baker] oops, not a folder baker recognizes. 
         Try a folder generated by baker, e.g., a temporary folder?==")
  }
  #
  # Read data from DIR_NPLCM:
  #
  data_nplcm <- dget(file.path(DIR_NPLCM,"data_nplcm.txt"))  
  model_options <- dget(file.path(DIR_NPLCM,"model_options.txt"))  
  
  new_env <- new.env()
  source(file.path(DIR_NPLCM,"jagsdata.txt"),local=new_env)
  bugs.dat <- as.list(new_env)
  rm(new_env)
  res_nplcm <- coda::read.coda(file.path(DIR_NPLCM,"CODAchain1.txt"),
                               file.path(DIR_NPLCM,"CODAindex.txt"),
                               quiet=TRUE)
  print_res <- function(x) plot(res_nplcm[,grep(x,colnames(res_nplcm))])
  get_res   <- function(x) res_nplcm[,grep(x,colnames(res_nplcm))]
  
  # structure the posterior samples:
  JBrS_1        <- ncol(bugs.dat$MBS_1) # number of pathogens in MBS1
  n_samp_kept   <- nrow(res_nplcm)      # number of posterior sample after burn-in
  Jcause        <- bugs.dat$Jcause      # number of causes
  Nd            <- bugs.dat$Nd          # case size
  Nu            <- bugs.dat$Nu          # control size
  n_unique_Eti_level <- bugs.dat$n_unique_Eti_level  # number of stratums
  
  # etiology:
  plotid_Eti <- which(data_nplcm$Y==1) # <--- specifies who to look at.
  Eti_prob_scale <- array(get_res("pEti"),c(n_samp_kept,n_unique_Eti_level,Jcause))
  
  # posterior etiology mean for each cause for each site
  Eti_mean <- apply(Eti_prob_scale,c(2,3),mean)   
  # posterior etiology quantiles for each cause for each site
  Eti_q    <- apply(Eti_prob_scale,c(2,3),quantile,c(0.025,0.975))
  
  # marginalized posteior etiology ignoring site
  Eti_overall <- apply(Eti_prob_scale,c(3,1),mean)
  # posteior etiology mean for each cause across all sites
  Eti_overall_mean <- rowMeans(Eti_overall)
  # posteior etiology quantiles for each cause across all sites
  Eti_overall_q    <- apply(Eti_overall,1,quantile,c(0.025,0.975))
  
  
  # weight to marginalize posterior etiology distributions
  user_weight <- rep(1/n_unique_Eti_level,n_unique_Eti_level) # c(0.3,0.2,0.1,0.1,0.1,0.1,0.1)
  if(!is.null(strata_weights) && length(strata_weights==n_unique_Eti_level)){
    user_weight <- strata_weights
  }
  # marginalized posterior etiology over all sites using user-defined weights
  Eti_overall_usr_weight <- apply(Eti_prob_scale,1,function(S) t(S)%*%matrix(user_weight,ncol=1))
  # marginalized posterior etiology mean using user-defined weights
  Eti_overall_mean_usr_weight <- rowMeans(Eti_overall_usr_weight)
  # marginalized posterior etiology quantiles using user-defined weights
  Eti_overall_q_usr_weight    <- apply(Eti_overall_usr_weight,1,quantile,c(0.025,0.975))
  
  # plot posterior distribution for etiology probability
  par(mfcol=c(1+n_unique_Eti_level,Jcause),mar=c(3,10,1,0))
  for (j in 1:Jcause){
    # if (j==1) {par(mar=c(3,0,2,0))}
    # if (j>1)  {par(mar=c(3,0,1,0))}
    for (site in 1:n_unique_Eti_level){
      hist(Eti_prob_scale[,site,j],xlim=c(0,1),breaks="Scott",freq=FALSE,main="",xlab="")
      if (!is.null(truth$allEti)){
        abline(v = truth$allEti[[site]][[j]], col="blue", lwd=3, lty=2) # mark the truth.
        q_interval <- quantile(Eti_prob_scale[,site,j],c(0.025,0.975))
        is_included <- truth$allEti[[site]][[j]] < q_interval[2] && truth$allEti[[site]][[j]] > q_interval[1]
        abline(v = q_interval,col=c("gray","red")[2-is_included],lwd=2,lty=1)
      }
      mtext(text = paste0('level',levels(as.factor(data_nplcm$X$SITE))[site],": ",
                          Jcause[j]),3,-1,cex=2,adj = 0.9)
      if (j==1){
        mtext(paste0(round(user_weight[site],4)),2,5,cex=2,col="blue",las=1)
        if (site==ceiling(n_unique_Eti_level/2)) {mtext("User-specified weight towards overall pie:", 2,12, cex=3)}
      }
    }
    hist(Eti_overall_usr_weight[j,],xlim=c(0,1),breaks="Scott",freq=FALSE,main="",col="blue",
         xlab="Etiology")
    if (!is.null(truth$allEti)){
      truth_overall <- c(matrix(user_weight,nrow=1)%*%do.call(rbind,truth$allEti))
      abline(v=truth_overall[site],col="orange",lty=1,lwd=2) # mark the truth.
      q_interval_Eti_overall <- quantile(Eti_overall_usr_weight[j,],c(0.025,0.975))
      is_included_Eti_overall <- truth_overall < q_interval_Eti_overall[2] && 
        truth_overall > q_interval_Eti_overall[1]
      abline(v = q_interval_Eti_overall,col=c("gray","red")[2-is_included_Eti_overall],lwd=2,lty=1)
      
    }
    mtext(text = paste0("Overall: ",model_options$likelihood$cause_list[j]),3,adj=0.9,cex=2,col="blue")
  }
  }