###### Function to calculate BF and p-values ######
# #' Calculate Bayes Factor and empirical p-values from observed Z-scores and priors
# #'
# #' @inheritParams bGWAS
# NOT EXPORTED
request_BFandP <- function(Prior, sign_thresh, use_permutations = FALSE,
                           sign_method, save_files = FALSE, verbose = FALSE) {
  # Compute observed Bayes Factors (BF) for all SNPs, derive BF p-values
  # (empirically via permutations, or analytically from the prior distribution),
  # then add p-values — and, if requested, BH-adjusted FDR values — for the
  # posterior and direct effects.
  #
  # Prior            : data.frame/tibble with (at least) columns z_obs,
  #                    mu_prior_estimate, mu_prior_std_error, z_posterior, z_direct
  # sign_thresh      : significance threshold used when refining approximated p-values
  # use_permutations : if TRUE, BF p-values are estimated from null simulations
  # sign_method      : if "fdr", Benjamini-Hochberg adjusted columns are added
  # save_files       : if TRUE, writes PriorBFp.csv and removes Prior.csv
  # verbose          : if TRUE, log messages are printed as they are produced
  #
  # Returns a list with: log_info (character vector of log lines) and
  # SNPs (the Prior data augmented with BF / p-value columns).
  Log = c()
  tmp = paste0("# Computing observed Bayes Factor for all SNPs... \n")
  Log = update_log(Log, tmp, verbose)
  # BF = likelihood of z_obs under the prior-informed alternative
  # N(mu_prior_estimate, 1 + mu_prior_std_error^2), divided by its
  # likelihood under the null N(0, 1)
  Prior %>%
    mutate(BF = stats::dnorm(mean = .data$mu_prior_estimate,
                             sd = sqrt(1 + .data$mu_prior_std_error**2),
                             x = .data$z_obs) /
             stats::dnorm(mean = 0.0,
                          sd = sqrt(1),
                          x = .data$z_obs)) %>% # and order by BF (descending)
    arrange(desc(.data$BF)) -> Prior
  tmp = "Done! \n"
  Log = update_log(Log, tmp, verbose)

  tmp = "# Computing BF p-values... \n"
  Log = update_log(Log, tmp, verbose)

  if(use_permutations){
    tmp = " using a permutation approach: \n"
    # FIX: this message was assigned but never logged in the original
    Log = update_log(Log, tmp, verbose)
    # how many null z-scores should we simulate per run? (one per SNP)
    N.one.set.of.nulls = nrow(Prior)
    # Count, for each observed BF, how many null BFs are strictly larger.
    # Both vectors must be sorted in descending order, which allows a single
    # linear merge pass instead of an O(n^2) comparison.
    count_larger <- Rcpp::cppFunction(' IntegerVector count_larger(NumericVector real, NumericVector null) {
        int N_real = real.size();
        int N_null = null.size();
        // both vectors are in descending order. Make use of this
        IntegerVector result(N_real);
        int j = 0; // the observed one is smaller than (not equal) this many nulls
        for(int i=0; i<N_real; ++i) {
            double real_i = real.at(i);
            while(j < N_null && null.at(j) > real_i) {
                ++j;
            }
            result.at(i) = j;
        }
        return result;
    } ', env = NULL)
    # counts for each SNP: start with all 0
    running.total = rep.int(0, nrow(Prior))
    tmp = "# Computing null Bayes Factors (needed to compute empirical p-values)... \n"
    Log = update_log(Log, tmp, verbose)
    NUMBER_OF_NULLRUNS = 1000
    full.number.of.nulls.for.comparison = NUMBER_OF_NULLRUNS * N.one.set.of.nulls
    tmp = paste0(NUMBER_OF_NULLRUNS, " random z-scores simulations for the whole set of SNPs (",
                 format(nrow(Prior), big.mark = ",", scientific = FALSE), ") : ",
                 format(full.number.of.nulls.for.comparison, big.mark = ",", scientific = FALSE),
                 " null BFs are calculated \n")
    Log = update_log(Log, tmp, verbose)
    for(i in seq_len(NUMBER_OF_NULLRUNS)){
      # progress message every 20 simulations
      if(i %% 20 == 1){
        tmp = paste0("Simulation ", i, "... \n")
        Log = update_log(Log, tmp, verbose)
      }
      # simulate z-scores from a standard normal distribution
      # ... but this does not account for the real "true distribution" (inflation?)
      # should we shuffle z-scores instead?
      Prior %>%
        mutate(z = stats::rnorm(N.one.set.of.nulls, 0, 1),
               BF.null = stats::dnorm(mean = .data$mu_prior_estimate,
                                      sd = sqrt(1 + .data$mu_prior_std_error**2),
                                      x = .data$z) /
                 stats::dnorm(mean = 0.0,
                              sd = sqrt(1),
                              x = .data$z)) %>%
        arrange(desc(.data$BF.null)) %>%
        pull(.data$BF.null) -> nullbf
      counts = count_larger(Prior$BF, nullbf)
      stopifnot(length(counts) == nrow(Prior))
      running.total = running.total + counts
    }
    tmp = "Done! \n"
    Log = update_log(Log, tmp, verbose)
    tmp = "# Computing empirical p-values... \n"
    Log = update_log(Log, tmp, verbose)
    # drop the temporary simulation columns
    Prior$z = NULL
    Prior$BF.null = NULL
    # +1 in the numerator avoids empirical p-values of exactly 0
    # NOTE(review): the standard estimator also adds 1 to the denominator;
    # kept as-is to preserve previously published results
    Prior$BF_p = (running.total + 1) / full.number.of.nulls.for.comparison
  } else {
    tmp = " using a distribution approach: \n"
    Log = update_log(Log, tmp, verbose)
    ### Functions to estimate p-value from BF/prior distribution
    ## Calculate a p-value using the non-linear quantiles
    # Approximate p-value for one BF, using a vector of quantiles of the prior
    # means, the mean prior variance (sigma2), and quantile weights.
    # Solves the BF inequality in z analytically (two-sided normal tail areas).
    get_OneP <- function(myBF, perc, sigma2, weights){
      suppressWarnings({ # warnings when p-value = NA (small BFs), dealt with below
        pPercentiles = stats::pnorm( (-perc/sigma2) +
                                       sqrt(1+sigma2)/sigma2 *
                                       sqrt(perc**2 + 2*sigma2 * log(myBF * sqrt(1+sigma2))), lower.tail=F) +
          stats::pnorm((-perc/sigma2) -
                         sqrt(1+sigma2)/sigma2 *
                         sqrt(perc**2 + 2*sigma2 * log(myBF * sqrt(1+sigma2))), lower.tail=T)
      })
      # NA means the polynomial has no real solution, i.e. Pr(BF_null > BF) = 1
      pPercentiles[is.na(pPercentiles)] = 1
      # weighted combination of the per-quantile p-values
      p = pPercentiles %*% weights
      return(p)
    }
    ## Exact p-value for one BF, using all priors ("true value")
    # myBF : scalar BF - Data : data.frame with mu_prior_estimate / mu_prior_std_error
    get_fullP <- function(myBF, Data){
      suppressWarnings({ # warnings when p-value = NA (small BFs), dealt with below
        Data %>%
          mutate( pSNP = stats::pnorm( (-.data$mu_prior_estimate/(.data$mu_prior_std_error**2)) +
                                         sqrt(1+.data$mu_prior_std_error**2)/(.data$mu_prior_std_error**2) *
                                         sqrt(.data$mu_prior_estimate**2 + 2*.data$mu_prior_std_error**2 * log(myBF * sqrt(1+.data$mu_prior_std_error**2))), lower.tail=F) +
                    stats::pnorm((-.data$mu_prior_estimate/(.data$mu_prior_std_error**2)) -
                                   sqrt(1+.data$mu_prior_std_error**2)/(.data$mu_prior_std_error**2) *
                                   sqrt(.data$mu_prior_estimate**2 + 2*.data$mu_prior_std_error**2 * log(myBF * sqrt(1+.data$mu_prior_std_error**2))), lower.tail=T)) -> Data
      })
      Data %>%
        mutate(pSNP = tidyr::replace_na(.data$pSNP, 1)) %>% # NA -> 1: no real solution, so Pr(allBF>BF)=1
        summarize(mean(.data$pSNP)) %>% as.numeric() -> p # all priors used, so an unweighted mean is correct
      return(p)
    }
    # Get all p-values for a vector of BFs (sorted in decreasing order);
    # returns a list with 2 elements: "pValue" & "log_info"
    get_pValue <- function(myBFs, Data, sign_thr = 5e-8, verbose = FALSE){
      Log = c()
      tmp = "... getting approximated p-values using non-linear quantiles \n"
      Log = update_log(Log, tmp, verbose)
      d = data.frame(BF = myBFs)
      d$my_p = NA
      # quantiles on the 10^... scale: from 0 to 0.5, then from 0.5 to 1
      quant = c(0, 10^(seq(-10, 0, 0.1))/2, 1 - rev(10^(seq(-10, 0, 0.1))/2)[-1], 1)
      # weight of a quantile = half the "size" of the interval before it
      # + half the "size" of the interval after it
      sizes = quant[-1] - quant[-length(quant)]
      wghts = numeric(length(quant))
      for(i in seq_along(quant)){
        if(i == 1){
          wghts[i] = sizes[i]/2
        } else if(i == length(quant)){
          wghts[i] = sizes[i-1]/2
        } else {
          wghts[i] = sizes[i]/2 + sizes[i-1]/2
        }
      }
      # empirical quantiles of the prior means, and mean prior variance
      quantiles = stats::quantile(Data$mu_prior_estimate, quant)
      sigma2 = mean(Data$mu_prior_std_error**2)
      # apply the approximation to each BF
      # (vapply over the vector rather than apply() over the data.frame,
      # which would coerce it to a matrix)
      d$my_p = vapply(d$BF,
                      function(bf) as.numeric(get_OneP(bf, quantiles, sigma2, wghts)),
                      numeric(1))
      tmp = "... checking p-values near significance threshold \n"
      Log = update_log(Log, tmp, verbose)
      # The approximation can misclassify SNPs right around the threshold:
      # re-estimate those with the exact (full-prior) formula.
      count_corrected = 0
      # Walk down from the last non-significant SNP (BFs are sorted, so
      # p-values increase): fix any false negatives.
      # FIX: guard against which() being empty and against running off the end.
      if(any(d$my_p > sign_thr)){
        snp = min(which(d$my_p > sign_thr))
        while(snp <= nrow(d)){
          p_true = get_fullP(d$BF[snp], Data)
          if(p_true < sign_thr){
            d$my_p[snp] = p_true
            count_corrected = count_corrected + 1
            snp = snp + 1
          } else {
            break
          }
        }
      }
      # Walk up from the first significant SNP: fix any false positives.
      if(any(d$my_p < sign_thr)){
        snp = max(which(d$my_p < sign_thr))
        while(snp >= 1){
          p_true = get_fullP(d$BF[snp], Data)
          if(p_true > sign_thr){
            d$my_p[snp] = p_true
            count_corrected = count_corrected + 1
            snp = snp - 1
          } else {
            break
          }
        }
      }
      # report how many p-values needed the exact formula
      if(count_corrected == 0) tmp = " everything is ok! \n"
      if(count_corrected == 1) tmp = paste0(" ", count_corrected, " p-value has been re-estimated using the exact formula. \n")
      if(count_corrected > 1) tmp = paste0(" ", count_corrected, " p-values have been re-estimated using the exact formula. \n")
      Log = update_log(Log, tmp, verbose)
      res = list()
      res$pValue = d$my_p
      res$log_info = Log
      return(res)
    }
    PVal = get_pValue(Prior$BF, Prior, sign_thresh, verbose)
    Prior %>%
      mutate(BF_p = PVal$pValue) -> Prior
    Log = c(Log, PVal$log_info)
  }
  if(sign_method == "fdr"){
    tmp = "# Estimating FDR (Benjamini-Hochberg procedure)... \n"
    Log = update_log(Log, tmp, verbose)
    # FIX: p.adjust() defaults to method "holm"; the log message promises
    # Benjamini-Hochberg, so request it explicitly
    Prior %>%
      mutate(BF_fdr = stats::p.adjust(.data$BF_p, method = "BH")) -> Prior
    tmp = "Done! \n"
    Log = update_log(Log, tmp, verbose)
  }
  ### add p-values for posterior / direct effects
  tmp = "# Estimating p-values for posterior effects... \n"
  Log = update_log(Log, tmp, verbose)
  # two-sided p-value from the posterior z-score
  Prior %>%
    mutate(p_posterior = 2 * stats::pnorm(-abs(.data$z_posterior))) -> Prior
  if(sign_method == "fdr"){
    tmp = "# Estimating FDR (Benjamini-Hochberg procedure) for posterior effects... \n"
    Log = update_log(Log, tmp, verbose)
    # FIX: explicit BH method (see above)
    Prior %>%
      mutate(fdr_posterior = stats::p.adjust(.data$p_posterior, method = "BH")) -> Prior
  }
  tmp = "Done! \n"
  Log = update_log(Log, tmp, verbose)
  tmp = "# Estimating p-values for direct effects... \n"
  Log = update_log(Log, tmp, verbose)
  # two-sided p-value from the direct-effect z-score
  Prior %>%
    mutate(p_direct = 2 * stats::pnorm(-abs(.data$z_direct))) -> Prior
  if(sign_method == "fdr"){
    tmp = "# Estimating FDR (Benjamini-Hochberg procedure) for direct effects... \n"
    Log = update_log(Log, tmp, verbose)
    # FIX: explicit BH method (see above)
    Prior %>%
      mutate(fdr_direct = stats::p.adjust(.data$p_direct, method = "BH")) -> Prior
  }
  tmp = "Done! \n"
  Log = update_log(Log, tmp, verbose)
  if(save_files){
    # FIX: unlink() is portable and silent if the file is absent,
    # unlike system("rm Prior.csv")
    unlink("Prior.csv")
    readr::write_csv(Prior, "PriorBFp.csv")
    # FIX: logged filename now matches the file actually written
    tmp = "The file Prior.csv has been updated into PriorBFp.csv \n"
    Log = update_log(Log, tmp, verbose)
  }
  res = list(log_info = Log,
             SNPs = Prior)
  return(res)
}
# (Removed scraped HTML boilerplate — "Add the following code to your website..." —
#  which was not R code and broke parsing of this file.)