R/summary_functions.R

Defines functions forest_cpbayes post_summaries estimate_directions overall_pleio_measure select_subset

Documented in forest_cpbayes post_summaries

 ##=================================************************************************=================================##
 ## This file contains functions that are used for summarizing the MCMC data generated to make inference on pleiotropy.
 ## These functions are also the same for both uncorrelated and correlated versions of CPBayes.
 ## There is also one function which can be used for making some post summaries for interesting variants
 ## For example, estimating the direction of associations and posterior mean/median of beta or odds ratios
 ##=================================************************************************=================================##

 ##======================********** subset selection using the MCMC sample data ***********==========================##

 select_subset <- function( K, Z.data, mcmc.samplesize )
 {
     #asso.pr = colSums(Z.data)/mcmc.samplesize
     z = as.data.frame(Z.data)
     Z.summary = aggregate(z, by=z, length)[1:(ncol(z)+1)]
     row.no = which.max(Z.summary[,K+1])
     select.Z = Z.summary[row.no,]
     select.Z = select.Z[-(K+1)]
     subset = which(select.Z==1)
     #Z.sort.summary.uncor = Z.summary[order(Z.summary[,K+1]),] ;
     #data <- list( subset = subset, asso.pr = asso.pr )
     return(subset)
 }

 ##======================********** compute log10BF and PPNA using the MCMC sample data ***********==================##

 overall_pleio_measure <- function( K, shape1, shape2, sample_probZ_zero )
 {
     ## calculate the Bayes factor, first compue the posterior odds
     log_probZ_zero <- log(sample_probZ_zero)

     log_probZ_zero <- rowSums(log_probZ_zero)
     probZ_zero <- exp(log_probZ_zero)
     poste_deno <- mean(probZ_zero)

     log_posterior_deno <- log(poste_deno)
     log_posterior_nume <- log(1-poste_deno)
     log_posterior_odds <- log_posterior_nume - log_posterior_deno

     ## compute the prior odds
     p1 <- shape1/(shape1+shape2)
     p0 <- 1-p1; log.p0 <- log(p0); log.nume <- log(1-(p0^K));
     log.deno = K*log.p0; log.prior.odds = log.nume - log.deno;

     log_BF <- log_posterior_odds-log.prior.odds
     log10_BF <- log10(exp(log_BF))
     if(log10_BF == Inf) log10_BF <- 300                        ## assigning an upper bound if the BF becomes infinity

     ## compute the PPNA using the Posterior odds computed in the above
     a <- exp(log_posterior_odds)
     log_deno <- log(1+a)
     PPNA <- exp(-log_deno)
     if(PPNA == 0) PPNA <- 10^(-300)                            ## assigning an lower bound if it becomes zero.

     data <- list( log10_BF = log10_BF, PPNA = PPNA )
     return(data)
 }

 ##===============********** Estimate directions of associations using the MCMC sample data ***********==============##

 estimate_directions <- function( K, sim.beta)
 {
     ## estimation of the effect directions
     direction <- sign(sim.beta)
     direction <- (direction+1)/2                                            ## convert from +1/-1 scale to 0/1 scale
     direction_prob <- colMeans(direction)                                   ## probability of an effect being positive

     positive_association <- direction_prob>0.5
     effect_direction <- character(K)
     effect_direction[which(positive_association==TRUE)] <- "positive"
     effect_direction[which(positive_association==FALSE)] <- "negative"

     direction <- effect_direction
 }

 ##===========================********** post summaries for a pleiotropy signal ***********==========================##
 ## (1-level)% credible interval to be computed by CPBayes
 ## default value of level = 0.05. It also computes posterior summary of beta and odds ratio
 ##===========================*************************************************************==========================##
## Post processing of the mcmc data genarated by MCMC.
#' Post summary of the MCMC data generated by the uncorrelated or correlated version of CPBayes.
#'
#' Run the \code{\link{post_summaries}} function to summarize the MCMC data produced by
#'  \code{\link{cpbayes_uncor}} or \code{\link{cpbayes_cor}} and obtain meaningful insights
#'   into an observed pleiotropic signal.
#'
#' @param mcmc_output A list returned by either
#'  \code{\link{cpbayes_uncor}} or \code{\link{cpbayes_cor}}. This list
#' contains the primary results and MCMC data produced by \code{\link{cpbayes_uncor}}
#'  or \code{\link{cpbayes_cor}}. No default is specified. See the example below.
#' @param level A numeric value. (1-level)\% credible interval (Bayesian analog of the
#'  confidence interval) of the unknown true genetic effect (beta/odds ratio)
 #'   on each trait is computed. Default choice is 0.05.
#' @return The output produced by this function is a list that consists of various components.
#'    \item{variantName}{It is the name of the genetic variant provided by the user. If not
#'     specified by the user, default name is `Variant'.}
#'    \item{log10_BF}{It provides the log10(Bayes factor) produced by CPBayes that measures
#'     the evidence of the overall pleiotropic association.}
#'    \item{locFDR}{It provides the local false discovery rate (posterior probability of null association) produced by
#'     CPBayes (a Bayesian analog of the p-value) which is a measure of the evidence
#'      of aggregate-level pleiotropic association. Bayes factor is adjusted for prior odds, but
 #'      locFDR is solely a function of posterior odds. locFDR can sometimes be significantly small
 #'      indicating an association, but log10_BF may not. Hence, always check both log10_BF and locFDR.}
#'    \item{subset}{A data frame providing the optimal subset of associated/non-null traits
#'     along with their trait-specific posterior probability of association (PPAj) and direction
#'      of associations. It is NULL if no phenotype is selected by CPBayes.}
#'    \item{important_traits}{It provides the traits which yield a trait-specific posterior
#'     probability of association (PPAj) > 20\%. Even if a phenotype is not selected in the
#'    optimal subset of non-null traits, it can produce a non-negligible value of
#'     trait-specific posterior probability of association. We note that `important_traits'
#'      is expected to include the traits already contained in `subset'. It provides the
#'       name of the important traits and their trait-specific posterior probability of
#'        association (PPAj) and the direction of associations. Always check
#'         'important_traits' even if 'subset' contains a single trait.
#'          It helps to better explain an observed pleiotropic signal.}
#'     \item{traitNames}{It returns the name of all the phenotypes specified by the user.
#'      Default is trait1, trait2, ... , traitK.}
#'     \item{PPAj}{Data frame providing the trait-specific posterior probability of
#'      association for all the phenotypes.}
#'     \item{poste_summary_beta}{Data frame providing the posterior summary of the
#'      unknown true genetic effect (beta) on each trait. It gives posterior mean,
#'       median, standard error, credible interval (lower and upper limits) of the
#'        true beta corresponding to each trait.}
#'     \item{poste_summary_OR}{Data frame providing the posterior summary of the unknown
#'      true genetic effect (odds ratio) on each trait. It gives posterior mean, median,
#'       standard error, credible interval (lower and upper limits) of the true odds
#'        ratio corresponding to each trait.}
#'
#' @references Majumdar A, Haldar T, Bhattacharya S, Witte JS (2018) An efficient Bayesian meta analysis approach for studying cross-phenotype genetic associations. PLoS Genet 14(2): e1007139.
#'
#' @seealso \code{\link{cpbayes_uncor}}, \code{\link{cpbayes_cor}}
#'
#' @examples
#' data(ExampleDataUncor)
#' BetaHat <- ExampleDataUncor$BetaHat
#' BetaHat
#' SE <- ExampleDataUncor$SE
#' SE
#' traitNames <- paste("Disease", 1:10, sep = "")
#' SNP1 <- "rs1234"
#' result <- cpbayes_uncor(BetaHat, SE, Phenotypes = traitNames, Variant = SNP1)
#' PleioSumm <- post_summaries(result, level = 0.05)
#' str(PleioSumm)
#'
#' @export
 post_summaries <- function( mcmc_output, level = 0.05 )
 {

     genetic_variant <- mcmc_output$variantName
     log10_BF <- mcmc_output$log10_BF
     PPNA <- mcmc_output$locFDR
     subset <- mcmc_output$subset
     important_phenos <- mcmc_output$important_traits

  	 MCMC_data <- mcmc_output$auxi_data

  	 traitNames <- MCMC_data$traitNames
     K <- MCMC_data$K
     mcmc.samplesize <- MCMC_data$mcmc.samplesize
     asso.pr <- MCMC_data$PPAj
     Z.data <- MCMC_data$Z.data
     sim.beta <- MCMC_data$sim.beta

     asso_prob = data.frame( traits = traitNames, PPAj = asso.pr, stringsAsFactors = FALSE)

     ## estimate the direction of associations
     alldirection <- estimate_directions( K, sim.beta )

     optimal_subset <- data.frame( traits = subset, stringsAsFactors = FALSE)

     if( length(subset) > 0 ){
         selected <- match(subset, traitNames)
         optimal_subset$PPAj <- asso.pr[selected]
         optimal_subset$direction <- alldirection[selected]
     }

     imp_traits <- important_phenos$traits
     if( length(imp_traits) > 0 ){
         selected <- match(imp_traits, traitNames)
         direct <- alldirection[selected]
         important_phenos$direction = direct
     }

     ## for summarizing the 'beta' parameters themselves
     poste_mean_beta <- 0
     poste_median_beta <- 0
     poste_se_beta <- 0
     CI_l <- 0                                                  ## lower limit of 95% credible interval for beta
     CI_u <- 0                                                  ## upper limit of 95% credible interval for beta

     poste_mean_OR <- 0                                         ## for summarizing the odds ratio (OR) = exp(beta)
     poste_median_OR <- 0
     CI_OR_l <- 0                                               ## lower limit of 95% credible interval for odds ratios
     CI_OR_u <- 0                                               ## upper limit of 95% credible interval for odds ratios

     lev <- level
     lev1 <- lev/2
     lev2 <- 1-(lev/2)

     for( k in 1:K ){
          poste_mean_beta[k] <- mean(sim.beta[,k])
          poste_se_beta[k] <- sd(sim.beta[,k])
          beta_qtls <- quantile( sim.beta[,k], prob = c( lev1, 0.5, lev2 ) )
          CI_l[k] <- beta_qtls[1]
          poste_median_beta[k] <- beta_qtls[2]
          CI_u[k] <- beta_qtls[3]

          ORs <- exp(sim.beta[,k])
          poste_mean_OR[k] <- mean(ORs)
          or_qtls <- quantile( ORs, prob = c( lev1, 0.5, lev2 ) )
          CI_OR_l[k] <- or_qtls[1]
          poste_median_OR[k] <- or_qtls[2]
          CI_OR_u[k] <- or_qtls[3]
     }

     poste_summary_beta = data.frame( traits = traitNames, poste_mean = poste_mean_beta,
                          poste_median = poste_median_beta, poste_se = poste_se_beta, lCl = CI_l,
                          uCl = CI_u, stringsAsFactors = FALSE)

     poste_summary_OR = data.frame( traits = traitNames, poste_mean = poste_mean_OR, poste_median = poste_median_OR,
                        lCl = CI_OR_l, uCl = CI_OR_u, stringsAsFactors = FALSE)

     data = list( variantName = genetic_variant, log10_BF = log10_BF, locFDR = PPNA, subset = optimal_subset,
            important_traits = important_phenos, traitNames = traitNames, PPAj = asso_prob,
            poste_summary_beta = poste_summary_beta, poste_summary_OR = poste_summary_OR )
 }


##===========================********** Forest plot for a pleiotropy signal ***********==========================##
## (1-level)% confidence interval of beta parameter is to be plotted in the forest plot. default value of level = 0.05.
##===========================*************************************************************==========================##
## Forest plot presenting pleiotropy result obtained by CPBayes.
#' Forest plot presenting pleiotropy result obtained by CPBayes.
#'
#' Run the \code{\link{forest_cpbayes}} function to create a forest plot that presents the pleiotropy result obtained
#' by \code{\link{cpbayes_uncor}} or \code{\link{cpbayes_cor}}.
#'
#' @param mcmc_output A list returned by either
#'  \code{\link{cpbayes_uncor}} or \code{\link{cpbayes_cor}}. This list
#' contains all the primary results and MCMC data produced by \code{\link{cpbayes_uncor}}
#'  or \code{\link{cpbayes_cor}}. No default is specified. See the example below.
#' @param level A numeric value. (1-level)\% confidence interval of the unknown true genetic effect (beta/log(odds ratio))
#'   on each trait is plotted in the forest plot. Default choice is 0.05.
#' @param PPAj_cutoff A numeric value. It's a user-specified threshold of PPAj (trait-specific posterior probability
#'   of association). Only those traits having PPAj values above this cut-off are included in the forest plot. So, the choice of
#'   this variable as '0.0' includes all traits in the forest plot. Default is 0.01.
#' @return The output produced by this function is a diagram file in .pdf format. The details of the diagram are as follows:
#'    \item{file_name}{The pdf file is named after the genetic variant. So, if the argument `Variant'
#'    in \code{\link{cpbayes_uncor}} or \code{\link{cpbayes_cor}} is specified as 'rs1234', the figure file is named as rs1234.pdf.}
#'    \item{Column1}{First column in the figure specifies the name of the phenotypes.}
#'    \item{Column2}{Second column provides the trait-specific univariate association p-value for a trait.}
#'    \item{Column3}{Third column provides the trait-specific posterior probability of association (PPAj) produced by CPBayes.}
#'    \item{Column4}{Fourth column states whether a phenotype was selected in the optimal subset of associated/non-null traits
#'    detected by CPBayes. If a phenotype was not selected, selected and positively associated, selected and negatively associated,
#'     its association status is stated as null, positive and negative, respectively.}
#'    \item{Column5}{In the right section of the figure, the primary eatimate and confidence interval of the beta/log odds ratio parameter for
#'     a trait is plotted.}
#' @references Majumdar A, Haldar T, Bhattacharya S, Witte JS (2018) An efficient Bayesian meta analysis approach for studying cross-phenotype genetic associations. PLoS Genet 14(2): e1007139.
#'
#' @seealso \code{\link{cpbayes_uncor}}, \code{\link{cpbayes_cor}}
#'
#' @examples
#' data(ExampleDataUncor)
#' BetaHat <- ExampleDataUncor$BetaHat
#' SE <- ExampleDataUncor$SE
#' traitNames <- paste("Disease", 1:10, sep = "")
#' SNP1 <- "rs1234"
#' result <- cpbayes_uncor(BetaHat, SE, Phenotypes = traitNames, Variant = SNP1)
#' \dontrun{forest_cpbayes(result, level = 0.05)}
#'
#' @export
forest_cpbayes <- function(mcmc_output, level = 0.05, PPAj_cutoff = 0.01){

   result <- mcmc_output
   betahat <- result$auxi_data$betahat
   se <- result$auxi_data$se
   summ <- post_summaries(result)                          ## summary of CPBayes results
   traits <- summ$traitNames                               ## phenotypes
   K <- length(traits)                                     ## number of traits
   selection <- rep("null", K)                             ## if no trait is selected

   if(is.null(summ$subset) == FALSE){
     selected_traits <- summ$subset$traits
     select_trait_posi <- match(selected_traits, traits)
     direction <- summ$subset$direction
     selection[select_trait_posi] <- direction
   }

   upper_alfa <- abs(qnorm( (level/2), 0,1))
   shift <- upper_alfa*se
   lowCIbeta <- betahat - shift                            ## lower confidence interval of beta
   upCIbeta <- betahat + shift                             ## upper confidence interval of
   pvalues <- pchisq( (betahat/se)^2, df=1, lower.tail=F )

   for(j in 1:K){
     x <- pvalues[j]
     count <- 0
     while(x < 1){ x <- 10*x; count <- count+1 }
     pvalues[j] <- round(pvalues[j], digits = count)     ## or, digits = count
   }

   PPAj <- summ$PPAj$PPAj
   select = which(PPAj > PPAj_cutoff)
   PPAj <- round(100*summ$PPAj$PPAj, digits = 1)

   PPAj <- paste(as.character(PPAj), "%", sep = "")
   labeltext <- data.frame( Trait = traits, Pvalue = as.character(pvalues), PPAj = PPAj, selection = selection, stringsAsFactors = FALSE )

   df_names <- data.frame( Trait = "Trait", Pvalue = "pvalue", PPAj = "PPAj", selection = "association", stringsAsFactors=F)
   labeltext = rbind(df_names,labeltext)

   betahat <- c(NA, betahat)
   lowCIbeta <- c(NA, lowCIbeta)
   upCIbeta <- c(NA, upCIbeta)


   log10BF <- round(summ$log10_BF, digits = 2)
   PPNA <- summ$locFDR
   x <- PPNA
   count <- 0
   while(x < 1){ x <- 10*x; count <- count+1 }
   PPNA <- round(PPNA, digits = count+1)

   if(length(select) > 0){

     select <- select+1
     select <- c(1,select)
     title <- summ$variantName
     pdffile <- paste(title, ".pdf", sep = "")
     title <- paste("Pleiotropy at ", title, sep = "")   ##  "PPNA = ", PPNA,
     pdf(pdffile)

     forestplot(labeltext[select, ], betahat[select], lowCIbeta[select], upCIbeta[select],
                zero = 0, lineheight = "auto", boxsize = 0.12, xlab = "Estimate and CI of log(OR)",
                col = fpColors(lines="red", box="darkred"), title = title, new_page = FALSE)
     # boxsize = 0.12,
     dev.off()
   }else{
     print("Forest plot not created, because no trait has a PPA_j value above the threshold specified.")
   }

}
ArunabhaCodes/CPBayes documentation built on Dec. 2, 2020, 10:22 p.m.