knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
library(winnerscurse)

Our R package, winnerscurse, also contains several functions which have the ability to obtain more accurate SNP-trait association estimates when the summary statistics from two genetic association studies, namely the discovery and replication GWASs, are available. Thus, this second vignette demonstrates how the package can be used to obtain new adjusted association estimates for each SNP using these two data sets. Similar to the first vignette which focused on 'discovery only' methods, we first use the function sim_stats to create two toy data sets and then, illustrate how a user may employ the package's functions to adjust for the bias imposed by Winner's Curse.

The methods that are currently available for implementation include:

  1. Conditional Likelihood method - adapted from Zhong and Prentice (2008)
  2. UMVCUE method - Bowden and Dudbridge (2009)

It is important to note that with both of these methods, adjustments are only made to SNPs that are deemed significant in the discovery data set, i.e. the $p$-values in the discovery data set of these SNPs are less than that of the specified threshold, $\alpha$.

A third method has also been added which obtains a new association estimate for each significant SNP in the discovery data set using a combination of the discovery and replication estimates:

  1. MSE minimization method - based on Ferguson et al. (2017)

Creating two toy data sets

set.seed(1998)

sim_dataset <- sim_stats(nsnp=10^6,h2=0.4,prop_effect=0.01,nid=50000,rep=TRUE,rep_nid=50000)

## simulated discovery GWAS summary statistics
disc_stats <- sim_dataset$disc
head(disc_stats)

## simulated replication GWAS summary statistics
rep_stats <- sim_dataset$rep
head(rep_stats)

Method 1: Conditional Likelihood

out_CL <- condlike_rep(summary_disc=disc_stats, summary_rep=rep_stats, alpha=5e-8)
head(out_CL)

$~$

Confidence intervals:

out_CL_conf <- condlike_rep(summary_disc=disc_stats, summary_rep=rep_stats, alpha=5e-8, conf_interval=TRUE, conf_level=0.95)
head(out_CL_conf)

[$\star$]{style="color: blue;"} Note: As the current form of condlike_rep uses the R function uniroot which aims to find values for beta_MLE_lower and beta_MLE_upper numerically, it is possible that uniroot may fail to successfully achieve this in some situations. We thus urge users to take care when using condlike_rep to obtain confidence intervals and to be mindful of this potential failure of uniroot.

Method 2: UMVCUE

out_UMVCUE <- UMVCUE(summary_disc = disc_stats, summary_rep = rep_stats, alpha = 5e-8)
head(out_UMVCUE)

Method 3: MSE minimization

out_minimizerA <- MSE_minimizer(summary_disc = disc_stats, summary_rep = rep_stats, alpha=5e-8, spline=FALSE)
out_minimizerB <- MSE_minimizer(summary_disc = disc_stats, summary_rep = rep_stats, alpha=5e-8)

head(out_minimizerA)
head(out_minimizerB)

Comparing Results

Just as in the first vignette, we can briefly compare the performance of our correct methods using measures such as the estimated root mean square error of significant SNPs (rmse) and the estimated average bias over all significant SNPs (bias). The significant SNPs we refer to here are those that have been deemed significant at a threshold of $\alpha=5 \times 10^{-8}$ in the discovery GWAS.

## Simulated true effect sizes:
true_beta <- sim_dataset$true$true_beta

The estimated root mean square error of significant SNPs for each method is computed below. It is clear that all methods show an improvement on the estimates obtained from the discovery data set. However, certain further investigation will be required in order to evaluate if the adjusted estimates are less biased or more accurate than the mere use of replication estimates.

## rmse
rmse <- data.frame(disc = sqrt(mean((out_CL$beta_disc - true_beta[out_CL$rsid])^2)), rep = sqrt(mean((out_CL$beta_rep - true_beta[out_CL$rsid])^2)), CL_com = sqrt(mean((out_CL$beta_com - true_beta[out_CL$rsid])^2)), CL_MLE = sqrt(mean((out_CL$beta_MLE - true_beta[out_CL$rsid])^2)), CL_MSE = sqrt(mean((out_CL$beta_MSE - true_beta[out_CL$rsid])^2)), UMVCUE = sqrt(mean((out_UMVCUE$beta_UMVCUE - true_beta[out_UMVCUE$rsid])^2)), minimizerA = sqrt(mean((out_minimizerA$beta_joint - true_beta[out_minimizerA$rsid])^2)), minimizerB = sqrt(mean((out_minimizerB$beta_joint - true_beta[out_minimizerB$rsid])^2)))
rmse

The next metric, the average bias over all significant SNPs with positive association estimates, is included below.

## bias positive
pos_sig <- out_CL$rsid[out_CL$beta_disc > 0]
pos_sigA <- which(out_CL$rsid %in% pos_sig)

bias_up <- data.frame(disc = mean(out_CL$beta_disc[pos_sigA] - true_beta[pos_sig]), rep = mean(out_CL$beta_rep[pos_sigA] - true_beta[pos_sig]), CL_com = mean(out_CL$beta_com[pos_sigA] - true_beta[pos_sig]), CL_MLE = mean(out_CL$beta_MLE[pos_sigA] - true_beta[pos_sig]), CL_MSE = mean(out_CL$beta_MSE[pos_sigA] - true_beta[pos_sig]), UMVCUE = mean(out_UMVCUE$beta_UMVCUE[pos_sigA] - true_beta[pos_sig]), minimizerA = mean(out_minimizerA$beta_joint[pos_sigA] - true_beta[pos_sig]), minimizerB = mean(out_minimizerB$beta_joint[pos_sigA] - true_beta[pos_sig]))
bias_up

In a similar manner, the average bias over all significant SNPs with negative association estimates, is computed.

## bias negative
neg_sig <- out_CL$rsid[out_CL$beta_disc < 0]
neg_sigA <- which(out_CL$rsid %in% neg_sig)

bias_down <- data.frame(disc = mean(out_CL$beta_disc[neg_sigA] - true_beta[neg_sig]), rep = mean(out_CL$beta_rep[neg_sigA] - true_beta[neg_sig]), CL_com = mean(out_CL$beta_com[neg_sigA] - true_beta[neg_sig]), CL_MLE = mean(out_CL$beta_MLE[neg_sigA] - true_beta[neg_sig]), CL_MSE = mean(out_CL$beta_MSE[neg_sigA] - true_beta[neg_sig]), UMVCUE = mean(out_UMVCUE$beta_UMVCUE[neg_sigA] - true_beta[neg_sig]), minimizerA = mean(out_minimizerA$beta_joint[neg_sigA] - true_beta[neg_sig]), minimizerB = mean(out_minimizerB$beta_joint[neg_sigA] - true_beta[neg_sig]))
bias_down

To complement the above, we provide an illustration of boxplots obtained for the root mean square error of significant SNPs (RMSE of sig. SNPs at $5 \times 10^{-8}$), plotted for each Winner's Curse correction method (Method).

library(RColorBrewer)
library(ggplot2)
col <- brewer.pal(8,"Dark2")
col1 <- brewer.pal(11,"RdYlBu")

all_results <- data.frame(rsid = c(rep(out_CL$rsid,8)), beta_disc = c(rep(out_CL$beta_disc,8)), se_disc = c(rep(out_CL$se_disc,8)), adj_beta = c(out_CL$beta_disc,out_CL$beta_rep,out_CL$beta_com,out_CL$beta_MLE,out_CL$beta_MSE,out_UMVCUE$beta_UMVCUE,out_minimizerA$beta_joint,out_minimizerB$beta_joint), method = c(rep("disc",33),rep("rep",33),rep("CL_com",33),rep("CL_MLE",33),rep("CL_MSE",33),rep("UMVCUE",33),rep("minimizerA",33),rep("minimizerB",33)))
all_results$rmse <- sqrt((all_results$adj_beta - true_beta[all_results$rsid])^2)
all_results$method <- factor(all_results$method, levels=c("disc","rep","CL_com", "CL_MLE", "CL_MSE", "UMVCUE", "minimizerA", "minimizerB"))

ggplot(all_results,aes(x=method,y=rmse,fill=method, color=method)) + geom_boxplot(size=0.7,aes(fill=method, color=method, alpha=0.2)) + scale_fill_manual(values=c(col[1],col[2],col[3],col[4],col[5],col[6],col[7],col[8])) + scale_color_manual(values=c(col[1],col[2],col[3],col[4],col[5],col[6],col[7],col[8]))  + xlab("Method") +
  ylab(expression(paste("RMSE of sig. SNPs at  ", 5%*%10^-8))) + theme_bw() + geom_hline(yintercept=0, colour="black") +  theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1),text = element_text(size=12),legend.position = "none", strip.text = element_text(face="italic")) 

In addition, we can gain a visual insight into the adjustments made by these functions by constructing a density plot with the adjusted absolute values along with the naive estimates and the true absolute $\beta$ values of significant SNPs, as follows:

true <- data.frame(rsid = c(out_CL$rsid), beta_disc =c(out_CL$beta_disc), se_disc =c(out_CL$se_disc),adj_beta=true_beta[out_CL$rsid],method=c(rep("true",33)))
all_resultsA <- rbind(all_results[,1:5],true)

ggplot(all_resultsA, aes(x=abs(adj_beta),colour=method,fill=method)) + geom_density(alpha=0.05,size=1) + scale_fill_manual(values=c(col[1],col[2],col[3],col[4],col[5],col[6],col[7],col[8],col1[11])) + scale_color_manual(values=c(col[1],col[2],col[3],col[4],col[5],col[6],col[7],col[8],col1[11])) + xlim(-0.01,0.08) +
  ylab("Density") + xlab(expression(paste("|", beta, "| " , "of sig. SNPs at  ", 5%*%10^-8))) + theme_bw() +  theme(text = element_text(size=12)) + labs(fill = "Method",colour="Method")

[$\star$]{style="color: blue;"} Note: In the above plot, it can be seen that there must be very little difference between the replication estimates and the adjusted estimates obtained using the UMVCUE method as the density curves of rep and UMVCUE can be seen to nearly overlap completely.

$~$ $~$

Using empirical Bayes and bootstrap methods with replication data sets

When a replication data set is also available, as has been described throughout this vignette, we also potentially have the option to employ the empirical Bayes and bootstrap methods. We have seen this work well in simulations, especially in terms of reducing the mean square error (MSE) and thus, consider exploration of their implementation in this setting. With data in the form described above similar to disc_stats and rep_stats, implementation of both methods could take place on the combined estimator as shown below:

## combined estimator:
com_stats <- data.frame(rsid = disc_stats$rsid, beta = ((((rep_stats$se)^2)*(disc_stats$beta))+(((disc_stats$se)^2)*(rep_stats$beta)))/(((disc_stats$se)^2) + ((rep_stats$se)^2)), se = sqrt((((disc_stats$se)^2)*((rep_stats$se)^2))/(((disc_stats$se)^2) + ((rep_stats$se)^2))))

## apply methods:
out_EB_com <- empirical_bayes(com_stats)
out_boot_com <- BR_ss(com_stats,seed_opt=TRUE,seed=2020)

## rearrange data frames with SNPs ordered 1,2,3.. 
out_EB_com <- dplyr::arrange(out_EB_com,out_EB_com$rsid)
out_boot_com <- dplyr::arrange(out_boot_com,out_boot_com$rsid)
out_EB_com <- data.frame(rsid = disc_stats$rsid, beta_disc = disc_stats$beta, se_disc  = disc_stats$se, beta_rep =rep_stats$beta, se_rep = rep_stats$se, beta_EB=out_EB_com$beta_EB)
out_boot_com <- data.frame(rsid = disc_stats$rsid, beta_disc = disc_stats$beta, se_disc  = disc_stats$se, beta_rep =rep_stats$beta, se_rep = rep_stats$se, beta_boot=out_boot_com$beta_BR_ss)

## reorder data frames based on significance in discovery data set:
out_EB_com <- dplyr::arrange(out_EB_com, dplyr::desc(abs(out_EB_com$beta_disc/out_EB_com$se_disc)))
out_EB_com <- out_EB_com[abs(out_EB_com$beta_disc/out_EB_com$se_disc) > stats::qnorm((5e-8)/2, lower.tail=FALSE),]
head(out_EB_com)

out_boot_com <- dplyr::arrange(out_boot_com, dplyr::desc(abs(out_boot_com$beta_disc/out_boot_com$se_disc)))
out_boot_com <- out_boot_com[abs(out_boot_com$beta_disc/out_boot_com$se_disc) > stats::qnorm((5e-8)/2, lower.tail=FALSE),]
head(out_boot_com)

We now compute similar evaluation metrics to above and produce the two plots in order to demonstrate the performance of the above approaches.

## rmse
rmse <- data.frame(disc = sqrt(mean((out_CL$beta_disc - true_beta[out_CL$rsid])^2)), rep = sqrt(mean((out_CL$beta_rep - true_beta[out_CL$rsid])^2)), EB_com = sqrt(mean((out_EB_com$beta_EB - true_beta[out_CL$rsid])^2)), boot_com = sqrt(mean((out_boot_com$beta_boot - true_beta[out_CL$rsid])^2)))
rmse

## bias positive
bias_up <- data.frame(disc = mean(out_CL$beta_disc[pos_sigA] - true_beta[pos_sig]), rep = mean(out_CL$beta_rep[pos_sigA] - true_beta[pos_sig]), EB_com = mean(out_EB_com$beta_EB[pos_sigA] - true_beta[pos_sig]), boot_com = mean(out_boot_com$beta_boot[pos_sigA] - true_beta[pos_sig]))
bias_up

## bias negative
bias_down <- data.frame(disc = mean(out_CL$beta_disc[neg_sigA] - true_beta[neg_sig]), rep = mean(out_CL$beta_rep[neg_sigA] - true_beta[neg_sig]), EB_com = mean(out_EB_com$beta_EB[neg_sigA] - true_beta[neg_sig]), boot_com = mean(out_boot_com$beta_boot[neg_sigA] - true_beta[neg_sig]))
bias_down
par(mfrow = c(2, 1))
all_results <- data.frame(rsid = c(rep(out_CL$rsid,4)), beta_disc = c(rep(out_CL$beta_disc,4)), se_disc = c(rep(out_CL$se_disc,4)), adj_beta = c(out_CL$beta_disc,out_CL$beta_rep,out_EB_com$beta_EB,out_boot_com$beta_boot), method = c(rep("disc",33),rep("rep",33),rep("EB_com",33),rep("boot_com",33)))
all_results$rmse <- sqrt((all_results$adj_beta - true_beta[all_results$rsid])^2)
all_results$method <- factor(all_results$method, levels=c("disc","rep","EB_com", "boot_com"))
ggplot(all_results,aes(x=method,y=rmse,fill=method, color=method)) + geom_boxplot(size=0.7,aes(fill=method, color=method, alpha=0.2)) + scale_fill_manual(values=c(col[1],col[2],col[3],col[4])) + scale_color_manual(values=c(col[1],col[2],col[3],col[4]))  + xlab("Method") +
  ylab(expression(paste("RMSE of sig. SNPs at  ", 5%*%10^-8))) + theme_bw() + geom_hline(yintercept=0, colour="black") +  theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1),text = element_text(size=10),legend.position = "none", strip.text = element_text(face="italic")) 
true <- data.frame(rsid = c(out_CL$rsid), beta_disc =c(out_CL$beta_disc), se_disc =c(out_CL$se_disc),adj_beta=true_beta[out_CL$rsid],method=c(rep("true",33)))
all_resultsA <- rbind(all_results[,1:5],true)
ggplot(all_resultsA, aes(x=abs(adj_beta),colour=method,fill=method)) + geom_density(alpha=0.05,size=1) + scale_fill_manual(values=c(col[1],col[2],col[3],col[4],col[8])) + scale_color_manual(values=c(col[1],col[2],col[3],col[4],col[8])) + xlim(-0.01,0.08) +
  ylab("Density") + xlab(expression(paste("|", beta, "| " , "of sig. SNPs at  ", 5%*%10^-8))) + theme_bw() +  theme(text = element_text(size=10),legend.position="bottom") + labs(fill = "Method",colour="Method") +guides(fill=guide_legend(nrow=2,byrow=TRUE))


amandaforde/winnerscurse documentation built on March 9, 2024, 6:09 p.m.