#' Run mJAM with Forward Selection
#'
#' @description fitting mJAM-Forward
#'
#' @param N_GWAS A vector of sample sizes in all original GWAS studies.
#' @param X_ref A list of matrices with individual-level SNP dosage data in each study/population. Each column corresponds to a SNP. Note that the columns name should match exactly to the SNP column in `Marg_Result` and `EAF_Result`. If certain SNP(s) is missing in dosage, then insert NAs in corresponding column(s).
#' @param Marg_Result A data frame with marginal summary statistics from all studies. Col1: SNP name; Col2: Effect sizes from study #1; Col3: Std Errors of effect sizes from study #1; ...
#' @param EAF_Result A data frame with effect allele frequency (EAF) from all studies. Col1: SNP name; Col2: EAF from study #1; Col3: EAF from study #2; ...
#' @param condp_cut Threshold of conditional p-value to be considered as significant. No default specified. Usually recommend 5e-8.
#' @param index_snps User-defined index SNP(s), if any. Default is `NULL` which means mJAM-Forward will automatically select index variants.
#' @param within_pop_threshold Threshold of r2 with selected index SNP(s) within a single population. If a SNP's correlation with any selected index SNP is greater than this threshold in at least one population, it will be excluded from subsequent rounds of index SNP selection.
#' @param across_pop_threshold Threshold of r2 with selected index SNP(s) across all populations. If a SNP's correlation with any selected index SNP is greater than this threshold in all populations, it will be excluded from subsequent rounds of index SNP selection.
#' @param coverage The required coverage of credible sets. Default is 0.95.
#' @param Pr_Med_cut Cut off of mJAM posterior mediation probability (P(Med)) during credible set construction. Low P(Med) may indicate low correlation between the candidate SNP and the index SNP. Any candidate credible set SNPs with P(Med) < Pr_Med_cut will be not be considered for credible set. Default is 0.
#' @param filter_rare A logical variable indicating whether to filter rare SNPs before the analysis. Default is `FALSE.` If `TRUE`, then please specify `rare_freq`.
#' @param rare_freq A vector of frequencies between 0 and 0.5 to specify the minor allele frequency cut-off if you want to filter rare SNPs before the analysis. Please also set `filter_rare` to be TRUE. For example, if there are 3 populations, then rare_freq = c(0.01, 0, 0.01) means SNPs with MAF < 0.01 in pop 1 and MAF < 0.01 in pop 3 will be removed from analysis.
#' @param filter_unstable_est whether to filter variants with inconsistent estimate between mJAM and meta-analysis.
#' @param use_robust_var_est whether to use the robust estimate of residual variance (weighting between median and individual estimates).
#' @import tibble
#' @import dplyr
#'
#' @export
#'
#' @author Jiayi Shen
#'
#' @returns
#'
#' \describe{
#' \item{index}{A table listing all the selected index SNP(s) (`SNP`), along with their log10(p-value) conditional on all SNP(s) above (`cond_log10p`), the log10(p-value) conditional on all other index SNP(s) (`final_log10p`), and the p-value threshold used in this analysis (`pcut`).}
#' \item{cs}{A table recording various posterior probabilities of all SNPs being considered for credible set SNPs. }
#' \item{mJAM_marg_est}{A table with the marginal effect estimates and standard errors of all SNPs under the mJAM model.}
#' \item{QC_marg_est}{The complete table of marginal effect estimates using fixed-effect model and mJAM model. For QC purpose only.}
#' }
#'
# N_GWAS = c(5000, 5000, 5000)
# X_ref = list(RefDosage_P1,RefDosage_P2,RefDosage_P3)
# Marg_Result = MargBeta
# EAF_Result = EAF
# condp_cut = 0.05/50
# index_snps = c("rs2")
# within_pop_threshold = 0.50
# across_pop_threshold = 0.20
# coverage = 0.95
# Pr_Med_cut = 0
# filter_rare = FALSE
# rare_freq = NULL
# N_GWAS = N
# X_ref = ref.region
# Marg_Result = betase.region
# EAF_Result = eaf.region
# condp_cut = gwas.sig.cutoff
# index_snps = force.include
# within_pop_threshold = 0.10
# across_pop_threshold = 0.10
# Pr_Med_cut = 0
# coverage = 0.99
# filter_rare = TRUE
# rare_freq = rep(0.01, length(ref.region))
# filter_unstable_est = TRUE
mJAM_Forward <- function(N_GWAS, X_ref,
Marg_Result, EAF_Result,
condp_cut = NULL,
index_snps = NULL,
within_pop_threshold = 0.50,
across_pop_threshold = 0.20,
coverage = 0.95,
Pr_Med_cut = 0,
filter_rare = FALSE,
rare_freq = NULL,
filter_unstable_est = FALSE,
use_robust_var_est = FALSE){
## Set parameters
N_SNP <- numSNPs <- numSNPs_wo_rare <- nrow(Marg_Result)
if(is.null(condp_cut)){condp_cut <- 0.05/N_SNP}
## Check index_snps is in marker names
if(!is.null(index_snps) && sum(!(index_snps %in% Marg_Result$SNP))>0){
not_in_index_snps = index_snps[!(index_snps %in% Marg_Result$SNP)]
stop(paste0(paste0(not_in_index_snps, collapse = ","), "not found in Marg_Result."))
}
## Check whether columns of X_ref, EAF_Results and Marg_Result are aligned.
if(!all(sapply(X_ref, function(x) identical(colnames(x), colnames(X_ref[[1]]))))){
stop("Columns of X_ref are not aligned with each other. \n
Please make sure SNPs are sorted in the exact order.")
}
if(!identical(Marg_Result$SNP, EAF_Result$SNP)){
stop("Rows in Marg_Result are not aligned to EAF_Result in the exact order.")
}
if(!identical(Marg_Result$SNP, colnames(X_ref[[1]]))){
stop("Rows in Marg_Result are not aligned to X_ref in the exact order.")
}
## if filter_rare, then remove rare SNPs
if(typeof(filter_rare)!="logical"){
stop("Please specify filter_rare to be either TRUE or FALSE.")
}
if(filter_rare == TRUE){
## check rare frequency cutoff
if(is.null(rare_freq)){
stop("Please specify a vector of minor allele frequency thresholds (between 0 and 0.5).")
}else if(length(rare_freq)!=length(X_ref)){
stop("Length of rare_freq does not match with the number of populations")
}else{
rare_freq_matrix = matrix(rep(rare_freq,numSNPs), byrow = TRUE, nrow = numSNPs, ncol = length(rare_freq))
## filter SNPs whose summary stat MAF < rare_freq in "ALL" population
rare_freq_true1 = abs(EAF_Result[,2:ncol(EAF_Result)]-0.5)>0.5-rare_freq_matrix
rare_filter_id1 = which(rowMeans(rare_freq_true1,na.rm=T)==1)
## filter SNPs whose reference MAF < rare_freq in "ALL" population
reference_EAF = matrix(NA, nrow = numSNPs, ncol = length(X_ref))
for(k in 1:length(X_ref)){
reference_EAF[,k] <- colMeans(X_ref[[k]],na.rm=T)/2
}
rare_freq_true2 = abs(reference_EAF-0.5)>0.5-rare_freq_matrix
rare_filter_id2 = which(rowMeans(rare_freq_true2,na.rm=T)==1)
rare_filter_id = union(rare_filter_id1, rare_filter_id2)
if(length(rare_filter_id)>0){
## filter Marg_Result and EAF_Result and X_ref
Marg_Result = Marg_Result[-rare_filter_id,]
EAF_Result = EAF_Result[-rare_filter_id,]
for(i in 1:length(X_ref)){X_ref[[i]] = X_ref[[i]][,-rare_filter_id]}
numSNPs_wo_rare = numSNPs - length(rare_filter_id)
}
}
}
Original_Input_Dosage <- X_ref
Input_MarglogOR <- Input_MargSElogOR <- Input_MAF <- feMeta_w <- feMeta_w_x_beta <- vector("list", length(X_ref))
for(pop in 1:length(X_ref)){
Input_MarglogOR[[pop]] <- Marg_Result[,2*pop]
Input_MargSElogOR[[pop]] <- Marg_Result[,2*pop+1]
Input_MAF[[pop]] <- EAF_Result[,1+pop]
names(Input_MarglogOR[[pop]]) <- Marg_Result[,1]
names(Input_MargSElogOR[[pop]]) <- Marg_Result[,1]
names(Input_MAF[[pop]]) <- EAF_Result[,1]
feMeta_w[[pop]] <- 1/Marg_Result[,2*pop+1]^2
feMeta_w_x_beta[[pop]] <- feMeta_w[[pop]]*Marg_Result[,2*pop]
}
feMeta_se <- sqrt(1/rowSums(do.call(cbind, feMeta_w), na.rm = TRUE))
feMeta_mean <- rowSums(do.call(cbind, feMeta_w_x_beta), na.rm = TRUE)/rowSums(do.call(cbind, feMeta_w), na.rm = TRUE)
## --- Check missing data
numEthnic <- length(Input_MarglogOR)
missing_tbl <- tibble(missing_ethnic_idx = integer(), missing_snp_idx = integer())
for(i in 1:numEthnic){
temp_missing_gwas <- union(which(is.na(Input_MarglogOR[[i]])), which(is.na(Input_MargSElogOR[[i]])))
temp_missing_gwas <- union(temp_missing_gwas, which(is.na(Input_MAF[[i]])))
temp_missing_dosage <- which(colSums(is.na(X_ref[[i]]))>0)
if(length(temp_missing_gwas)>0){
missing_tbl <- missing_tbl %>% add_row(missing_ethnic_idx =i, missing_snp_idx = temp_missing_gwas)
}
if(length(temp_missing_dosage)>0){
missing_tbl <- missing_tbl %>% add_row(missing_ethnic_idx =i, missing_snp_idx = temp_missing_dosage)
}
}
missing_tbl <- distinct(missing_tbl)
## --- Calculate the LD matrix of X_ref
Dosage_cor <- vector("list",length(X_ref))
for(i in 1:length(X_ref)){
if (nrow(missing_tbl) > 0 && i%in%missing_tbl$missing_ethnic_idx){
## --- Get missing SNP index
temp_missing_snp_idx <- filter(missing_tbl, missing_ethnic_idx == i) %>% pull(missing_snp_idx)
if(length(temp_missing_snp_idx)<ncol(X_ref[[i]])-1){
## --- Get Dosage_cor with complete SNP
temp_Dosage_cor <- cor(X_ref[[i]][,-temp_missing_snp_idx])^2
## --- Fill in missing SNPs with zeros
Dosage_cor[[i]] <- diag(1, nrow = numSNPs_wo_rare, ncol = numSNPs_wo_rare)
Dosage_cor[[i]][-temp_missing_snp_idx, -temp_missing_snp_idx] <- temp_Dosage_cor
}else{
Dosage_cor[[i]] <- diag(1, nrow = numSNPs_wo_rare, ncol = numSNPs_wo_rare)
}
}else{
Dosage_cor[[i]] <- cor(X_ref[[i]])^2
}
}
## -- Calculate sufficient statistics
GItGI <- GIty <- yty <- yty_med <- vector("list", numEthnic)
for (i in 1:numEthnic){
if (nrow(missing_tbl) > 0 && i%in%missing_tbl$missing_ethnic_idx){
## --- Get missing SNP index
temp_missing_snp_idx <- filter(missing_tbl, missing_ethnic_idx == i) %>% pull(missing_snp_idx)
## --- Get GItGI, GIty, yty with complete SNP
temp_X_ref <- X_ref[[i]][,-temp_missing_snp_idx]
temp_MAFs <- Input_MAF[[i]][-temp_missing_snp_idx]
temp_GItGI <- get_XtX(N_outcome = N_GWAS[i], Gl = temp_X_ref, maf = temp_MAFs)
##
temp.marginal.betas <- Input_MarglogOR[[i]][-temp_missing_snp_idx]
temp.marginal.se <- Input_MargSElogOR[[i]][-temp_missing_snp_idx]
temp.GIty <- get_z(maf = temp_MAFs, betas = temp.marginal.betas, N_outcome = N_GWAS[i])
##
# yty[[i]] <- get_yty(maf = temp_MAFs, N_outcome = N_GWAS[i], betas = temp.marginal.betas, betas.se = temp.marginal.se)
## --- Fill in missing SNPs with zeros
GItGI[[i]] <- matrix(0, nrow = numSNPs_wo_rare, ncol = numSNPs_wo_rare)
GItGI[[i]][-temp_missing_snp_idx, -temp_missing_snp_idx] <- temp_GItGI
GIty[[i]] <- rep(0, numSNPs_wo_rare)
GIty[[i]][-temp_missing_snp_idx] <- temp.GIty
}else{
GItGI[[i]] <- get_XtX(N_outcome = N_GWAS[i], Gl = X_ref[[i]],
maf = Input_MAF[[i]])
GIty[[i]] <- get_z(maf = Input_MAF[[i]],
betas = Input_MarglogOR[[i]], N_outcome = N_GWAS[i])
}
temp_yty <- get_yty(maf = Input_MAF[[i]], N_outcome = N_GWAS[i],
betas = Input_MarglogOR[[i]],
betas.se = Input_MargSElogOR[[i]])
yty[[i]] <- temp_yty$yTy.all
yty_med[[i]] <- temp_yty$yTy.est
colnames(GItGI[[i]]) <- rownames(GItGI[[i]]) <- Marg_Result[,1]
names(GIty[[i]]) <- names(yty[[i]]) <- Marg_Result[,1]
}
## Identify rare variants
rare_pct_sumstats <- rowMeans(abs(EAF_Result[,2:ncol(EAF_Result)]-0.5)>0.48|is.na(EAF_Result[,2:ncol(EAF_Result)]))
rare_SNPs_sumstats <- Marg_Result[which(rare_pct_sumstats>=0.5),1]
reference_EAF = matrix(NA, nrow = numSNPs_wo_rare, ncol = length(X_ref))
for(k in 1:length(X_ref)){
reference_EAF[,k] <- colMeans(X_ref[[k]],na.rm=T)/2
}
rare_pct_dosage <- rowMeans(abs(reference_EAF-0.5)>0.48|is.na(reference_EAF))
rare_SNPs_dosage <- Marg_Result[which(rare_pct_dosage>=0.5),1]
rare_SNPs <- union(rare_SNPs_sumstats, rare_SNPs_dosage)
if(length(rare_SNPs)==0){rare_SNPs <- NULL}
## Run Forward selection
iter_count <- 0
selected_ids <- NULL
prev_index_snp <- NULL
pruned_snps <- NULL
condp_list <- NULL
curr_min_condp <- 0
prev_min_condp <- 0
all_CS <- NULL
subset_EUR <- has_dosage_SNP <- Marg_Result$SNP
GItGI_curr <- GItGI
GIty_curr <- GIty
yty_curr <- yty
## --- Flag unstable estimates
## get the mJAM marginal estimates
if(sum(rare_SNPs %in% Marg_Result$SNP)>0){
rare_id = match(rare_SNPs, Marg_Result$SNP)
rare_id = rare_id[!is.na(rare_id)]
}else{
rare_id = NULL
}
newFS_RES <- mJAM_get_condp(GItGI = GItGI, GIty = GIty, yty = yty,
yty_med = yty_med, N_GWAS = N_GWAS, selected_id = NULL,
use_robust_var_est = use_robust_var_est,
use_median_yty_ethnic = NULL, rare_id = rare_id)
## output mJAM marginal p and meta marginal p
marginal_est <- tibble(SNP = Marg_Result$SNP,
mJAM_effect = signif(newFS_RES$effect_est, digits = 3),
feMeta_effect = feMeta_mean,
mJAM_se = signif(newFS_RES$se_est, digits = 3),
feMeta_se = feMeta_se) %>%
mutate(mJAM_logp = Rmpfr::pnorm(abs(mJAM_effect/mJAM_se),lower.tail = F,log.p = T)+log(2),
feMeta_logp = Rmpfr::pnorm(abs(feMeta_effect/feMeta_se),lower.tail = F,log.p = T)+log(2)) %>%
mutate(mJAM_log10p = signif(mJAM_logp/log(10), 4),
feMeta_log10p = signif(feMeta_logp/log(10),4)) %>%
mutate(unstable_est = (abs(mJAM_log10p - feMeta_log10p)>2)&(abs(mJAM_effect - feMeta_effect)/abs(feMeta_effect)>0.5))
if(filter_unstable_est){
unstable_SNPs = marginal_est %>% filter(unstable_est) %>% pull(SNP)
## remove unstable SNPs from analysis
if(length(unstable_SNPs)>0){
cat("Removed", length(unstable_SNPs), "SNPs with unstable estimates.\n")
Marg_Result <- Marg_Result %>% filter(!(SNP %in% unstable_SNPs))
subset_EUR <- has_dosage_SNP <- Marg_Result$SNP
for(e in 1:numEthnic){
GItGI_curr[[e]] <- GItGI[[e]][subset_EUR, subset_EUR]
GIty_curr[[e]] <- GIty[[e]][subset_EUR]
yty_curr[[e]] <- yty[[e]][subset_EUR]
colnames(GItGI_curr[[e]]) <- rownames(GItGI_curr[[e]]) <- subset_EUR
names(GIty_curr[[e]]) <- names(yty_curr[[e]]) <- subset_EUR
Dosage_cor[[e]] <- Dosage_cor[[e]][has_dosage_SNP,has_dosage_SNP]
}
}
}
while(iter_count >= 0){
## step 1: select top SNPs in the remaining list
## selected_id should be the ID in subset_EUR
if(length(unique(pruned_snps))==numSNPs_wo_rare){break}
if(iter_count == 0){Input_id = NULL}else{Input_id = match(selected_ids, subset_EUR)}
## get the id of rare SNPs in remaining SNPs
if(sum(rare_SNPs %in% subset_EUR)>0){
rare_id = match(rare_SNPs, subset_EUR)
rare_id = rare_id[!is.na(rare_id)]
}else{
rare_id = NULL
}
## get the conditional p-values of all remaining SNPs
newFS_RES <- mJAM_get_condp(GItGI = GItGI_curr, GIty = GIty_curr, yty = yty_curr,
yty_med = yty_med, N_GWAS = N_GWAS, selected_id = Input_id,
use_robust_var_est = use_robust_var_est,
use_median_yty_ethnic = NULL, rare_id = rare_id)
## determine the index SNP of current round
if(is.null(index_snps)){
if((newFS_RES$condp_min>log(condp_cut))) {break}
curr_index_snp <- subset_EUR[newFS_RES$which_condp_min]
}else{
if(iter_count >= length(index_snps)) {break}
curr_index_snp <- index_snps[iter_count+1]
}
selected_ids <- c(selected_ids, curr_index_snp)
condp_list <- c(condp_list, newFS_RES$condp[match(curr_index_snp, subset_EUR)])
message(paste("No.", iter_count+1,"selected index SNP:", curr_index_snp))
## step 2: construct credible sets based on the selected SNP
curr_CS <- mJAM_build_CS(X_id = curr_index_snp,
prev_X_list = prev_index_snp,
All_id = subset_EUR,
PrCS_weights = "Pr(M_C)",
coverage = coverage,
GItGI_curr = GItGI_curr, GIty_curr = GIty_curr,
yty_curr = yty_curr, yty_med = yty_med,
N_GWAS = N_GWAS, rare_SNPs = rare_SNPs,
Pr_Med_cut = Pr_Med_cut,
use_robust_var_est = use_robust_var_est)
## step 3: prune out CS SNPs and SNPs in LD with index SNP; subset input statistics
curr_CS_snp <- filter(curr_CS, CS_in == T) %>% pull(CS_SNP)
pruned_snps <- c(pruned_snps, curr_index_snp)
curr_LD_snp <- mJAM_LDpruning(target = match(curr_index_snp,has_dosage_SNP),
testing = match(has_dosage_SNP[-match(pruned_snps,has_dosage_SNP)],has_dosage_SNP),
R = Dosage_cor,
within_thre = within_pop_threshold, across_thre = across_pop_threshold)
pruned_snps <- c(pruned_snps,has_dosage_SNP[c(curr_LD_snp$remove_within, curr_LD_snp$remove_across)])
all_CS <- rbind(all_CS, curr_CS)
## step 4: update input statistics
if(length(pruned_snps)>0){
subset_EUR <- has_dosage_SNP[-match(pruned_snps, has_dosage_SNP)]
}
subset_EUR <- union(subset_EUR, selected_ids)
subset_EUR <- has_dosage_SNP[has_dosage_SNP%in%subset_EUR]
message(paste(length(unique(pruned_snps))-length(unique(selected_ids)), "SNPs got pruned.", length(subset_EUR), "SNPs left."))
if(length(subset_EUR)>1){
for(e in 1:numEthnic){
GItGI_curr[[e]] <- GItGI[[e]][subset_EUR, subset_EUR]
GIty_curr[[e]] <- GIty[[e]][subset_EUR]
yty_curr[[e]] <- yty[[e]][subset_EUR]
colnames(GItGI_curr[[e]]) <- rownames(GItGI_curr[[e]]) <- subset_EUR
names(GIty_curr[[e]]) <- names(yty_curr[[e]]) <- subset_EUR
}
## step 5: continue to the next iteration
iter_count <- iter_count+1
prev_index_snp <- c(prev_index_snp, curr_index_snp)
message("Continue to next round of index SNP selection.")
}else{
## step 5: continue to the next iteration
iter_count <- iter_count+1
prev_index_snp <- c(prev_index_snp, curr_index_snp)
break
}
}
message("Analysis ended.")
if(!is.null(selected_ids)){
final_condp_selected <- mJAM_get_condp_selected(GItGI = GItGI, GIty = GIty, yty = yty,
N_GWAS = N_GWAS,yty_med = yty_med,
selected_id = selected_ids,
use_robust_var_est = use_robust_var_est,
rare_SNPs = rare_SNPs)
if(length(selected_ids)>1){
finalp = final_condp_selected$condp[,1]
}else{
finalp = final_condp_selected$condp
}
MULTI_index <- tibble(SNP = selected_ids,
b_joint = final_condp_selected$b_joint,
b_joint_var = ifelse(length(selected_ids)==1, final_condp_selected$b_joint_var,
diag(final_condp_selected$b_joint_var)),
cond_log10p = condp_list/log(10),
final_log10p = finalp/log(10),
pcut = condp_cut)
## simplify CS output table
all_CS <- all_CS %>%
dplyr::select(c(index_SNP, CS_SNP, Post_Model_Prob_Ratio2, Post_Med_Prob2, SD_Post_CS_Prob, CumSum_Porb, CS_in)) %>%
rename(Post_Model_Prob = Post_Model_Prob_Ratio2,
Post_Med_Prob = Post_Med_Prob2,
CumSum_Prob = CumSum_Porb)
}else{
message("No index SNP selected in this region.")
MULTI_index = NULL
all_CS = NULL
}
return(list(index = MULTI_index,
cs = all_CS,
mJAM_marg_est = marginal_est[,c(1,2,4,8)],
QC_marg_est = marginal_est))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.