#' @title Pruning+thresholding polygenic risk score creation based on summary statistics
#'
#' @description Create a polygenic risk score based on summary statistics from prior GWAS/pQTL discovery studies.
#' Included variants in high LD can be decorrelated to prevent double-counting, using the conditional argument.
#'
#' @param variant_data An object of the format output by extract_variants().
#' @param gwas_info An object generated by get_trait_variants() or get_pQTLs().
#' @param remove_indels If TRUE, removes indels.
#' @param imp_threshold Imputation quality threshold, based on R^2. Any variant with lower imputation R^2 is removed.
#' @param binary_outcome Set to TRUE for binary traits, and FALSE for continuous outcomes (including pQTLs).
#' @param exclude_extreme_associations If TRUE, removes variants with an odds ratio > 5 or < 1/5.
#' @param maf_filter Variants with a MAF below the specified threshold are removed.
#' @param LDplot If TRUE, plots the LD matrix (squared correlation matrix of variants).
#' @param pruning_threshold Variant pairs with squared correlation (R^2) above pruning_threshold are pruned: one variant of each such pair is removed.
#' @param pruning_filter The criterion for choosing which variant of a high-LD pair to keep: 'p' (default) keeps the variant with the lower p-value, 'maf' keeps the variant with the higher MAF.
#' @param pval_threshold Variants with GWAS p-values > pval_threshold are discarded. Set to 1 to turn off.
#' @param conditional If TRUE, uses marg2con() to decorrelate variants in LD if these are within a given bp distance on the same chromosome.
#' @param cond_window The genomic distance, in base pairs, within which variants in LD are decorrelated.
#' @param cond_N The sample size of the original GWAS from which the marginal estimates were derived, or an approximation of it.
#' @param cond_stepwise If TRUE, conditional analysis is applied iteratively: within the specified window, variants are conditioned on the top variant, variants with p > pval_threshold are removed, and the procedure is repeated until only conditionally independent variants remain. If FALSE, a conditional joint analysis (i.e. of several variants jointly) is applied within the specified window.
#' @param ridge If TRUE, applies a ridge penalty. This only applies when conditional is set to TRUE.
#' @param lambda The parameter controlling the degree of ridge regularization.
#' @param scale If TRUE, centers and standardizes the polygenic risk score.
#' @param flowchart If TRUE, plots a flowchart describing the creation of the polygenic risk score.
#'
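#' @details The score is built in successive steps: quality filtering
#' (indels, duplicates, imputation quality, zero variance, MAF), matching
#' of variants against gwas_info, meta-analytic pooling of multiple
#' reported effect sizes (metafor::rma), harmonization of risk alleles
#' with the VCF coding (allowing for strand flips), LD pruning, optional
#' conditional decorrelation via marg2con(), p-value thresholding, and
#' computation of the weighted sum of allele dosages.
#'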
#' @return A list containing several data.frames with all relevant information. The risk score is stored in element 'prs'.
#' @examples
#' # vte_prs <- create_prs(vte_extracted_variants, vte_gwas_info)
#' # hist(vte_prs$prs$prs)
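#' # An illustrative call with LD decorrelation and scaling, assuming the
#' # same example objects as above:
#' # vte_prs_cond <- create_prs(vte_extracted_variants, vte_gwas_info,
#' #                            conditional = TRUE, cond_window = 35000,
#' #                            cond_N = 60000, scale = TRUE)
#' # summary(vte_prs_cond$prs$prs)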
#' @export
#' @importFrom PRISMAstatement "flow_exclusions"
#' @importFrom metafor "rma"
create_prs <- function (variant_data,
gwas_info,
remove_indels = FALSE,
imp_threshold = 0.8,
binary_outcome = TRUE,
exclude_extreme_associations = TRUE,
maf_filter = 0.001,
LDplot = FALSE,
pruning_threshold = 0.75,
pruning_filter = 'p',
pval_threshold = 5e-08,
conditional = FALSE,
cond_window = 35000,
cond_N = 60000,
cond_stepwise = TRUE,
ridge = FALSE,
lambda = 0,
scale = FALSE,
flowchart = TRUE)
{
start_time <- Sys.time()
res_list <- list()
e <- list()
if (remove_indels == TRUE) {
## Indels have REF and ALT alleles of unequal length, so test for any
## length difference (insertions as well as deletions).
ndif <- nchar(variant_data$fix$REF) - nchar(variant_data$fix$ALT)
if (any(ndif != 0)) {
rind <- which(ndif != 0)
variant_data$fix <- variant_data$fix[-rind, ]
variants_left <- variant_data$fix$ID
variant_data$gt <- variant_data$gt[, which(colnames(variant_data$gt) %in% variants_left)]
}
}
dups <- which(duplicated(data.frame(variant_data$fix)$ID))
if (length(dups) > 0) {
## Drop both copies of a duplicated ID. Variants are stored in the
## columns of gt (samples in the rows), so subset gt by column.
dup_ids <- data.frame(variant_data$fix)$ID[dups]
variant_data$fix <- variant_data$fix[-c(dups, dups - 1), ]
variant_data$gt <- variant_data$gt[, !colnames(variant_data$gt) %in% dup_ids]
cat("> Removed", length(dups), "duplicate variants (e.g. triallelic SNPs).\n")
}
v <- variant_data$fix
unique_variants <- unique(v$ID)
no_of_variants <- length(unique_variants)
cat("> A total of", no_of_variants, "variants was retrieved.\n")
e$variants_retrieved <- no_of_variants
drop_check <- function(v) {
if (nrow(v) == 0)
stop("All variants have been dropped. Try different tuning parameters.")
}
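## Imputation-quality filter: DR2 is the per-variant estimated imputation
## R^2 from the VCF; variants below imp_threshold are dropped.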
v$imp_qual <- v$DR2
low_imp_qual_l <- length(unique(v$ID[v$imp_qual < imp_threshold]))
v <- subset(v, imp_qual >= imp_threshold)
cat("> Dropped", low_imp_qual_l, "variants with imputation R^2 below threshold.\n")
e$low_imp_qual <- low_imp_qual_l
v <- v[, -c(6:10)]
unique_variants <- unique(v$ID)
drop_check(v)
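## Drop monomorphic variants: dosage columns with zero (or undefined)
## standard deviation carry no information for the score.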
varv <- suppressWarnings(sapply(variant_data$gt, sd))
novar <- names(which(is.na(varv) | varv == 0))
variant_data$gt <- variant_data$gt[, !colnames(variant_data$gt) %in% novar]
v <- subset(v, !ID %in% novar)
g <- gwas_info
g <- subset(g, !variant_id %in% novar)
l0 <- length(unique_variants) - length(unique(v$ID))
unique_variants <- unique(v$ID)
cat("> Dropped", l0, "variants with zero variance in the data.\n")
e$no_variance <- l0
drop_check(v)
g <- subset(g, variant_id %in% unique_variants)
l1 <- length(unique(v$ID[v$ID %in% unique(g$variant_id)]))
cat("> Dropped", length(unique_variants) - l1, "variants that did not match with gwas_info.\n")
e$missing_from_gwasinfo <- length(unique_variants) - l1
if (binary_outcome == TRUE) {
g <- subset(g, !is.na(or_per_copy_number))
}
else {
g <- subset(g, !is.na(beta_coefficient))
}
g <- g[!duplicated(g), ]
l2 <- length(unique(v$ID[v$ID %in% unique(g$variant_id)]))
cat("> Dropped", l1 - l2, "variants with missing effect sizes.\n")
e$missing_effectsize <- l1 - l2
if (binary_outcome == TRUE) {
g$effect_size <- log(g$or_per_copy_number)
}
else {
g$effect_size <- g$beta_coefficient
}
g <- subset(g, !is.na(risk_allele))
l3 <- length(unique(v$ID[v$ID %in% unique(g$variant_id)]))
cat("> Dropped", l2 - l3, "variants with missing risk alleles.\n")
e$missing_risk_allele <- l2 - l3
drop_check(v)
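## Some variants come with several reported effect sizes (e.g. from
## multiple studies); these are pooled into one meta-analytic estimate
## with metafor::rma(). Missing standard errors are reconstructed from
## the two-sided p-value as |beta / qnorm(p/2)|.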
snp_ind <- aggregate(effect_size ~ variant_id, g, function(x) length(unique(x)))
multiOR_l <- sum(as.numeric(snp_ind[, 2] > 1))
id_list <- list()
uids <- unique(snp_ind$variant_id)
for (i in seq_along(uids)) {
meta <- g[g$variant_id == uids[i], ]
if (nrow(meta) > 1) {
if (0 %in% meta$pvalue) {
meta$pvalue[which(meta$pvalue == 0)] <- min(meta$pvalue[meta$pvalue > 0])
}
meta$standard_error <- ifelse(is.na(meta$standard_error),
abs(meta$effect_size/qnorm(meta$pvalue/2)), meta$standard_error)
meta_est <- suppressWarnings(metafor::rma(yi = meta$effect_size,
sei = meta$standard_error))
}
id_list[[i]] <- g[g$variant_id == uids[i], ][1, ]
if (nrow(meta) > 1) {
id_list[[i]]$effect_size <- meta_est$beta
id_list[[i]]$standard_error <- meta_est$se
}
}
g <- do.call("rbind", id_list)
cat("> In case of multiple effect sizes (there were", multiOR_l,
"occurrences), used meta-analytic estimate for pooled effect size.\n")
e$multiple_effectsizes <- multiOR_l
v <- subset(v, ID %in% unique(g$variant_id))
g <- subset(g, variant_id %in% unique(v$ID))
unique_variants <- unique(v$ID)
radf <- list()
for (i in 1:length(unique_variants)) {
RA_g <- g$risk_allele[g$variant_id == unique_variants[i]]
RA_v <- v$ALT[v$ID == unique_variants[i]]
radf[[i]] <- c(RA_g, RA_v)
}
radf <- data.frame(do.call("rbind", radf))
colnames(radf) <- c("risk_allele_gwas", "risk_allele_vcf")
rownames(radf) <- unique_variants
radf$ref_allele_vcf <- v$REF[v$ID %in% rownames(radf)]
radf$complement_base_gwas <- rep(NA, nrow(radf))
radf$complement_base_vcf <- rep(NA, nrow(radf))
radf$complement_base_vcf_ref <- rep(NA, nrow(radf))
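## Check for strand flips: compare the GWAS risk allele (and its strand
## complement) against the ALT and REF alleles in the VCF.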
strandflip <- function(e) switch(e, A = "T", T = "A", C = "G", G = "C")
for (i in 1:nrow(radf)) {
s1 <- strandflip(radf$risk_allele_gwas[i])
s2 <- strandflip(radf$risk_allele_vcf[i])
s3 <- strandflip(radf$ref_allele_vcf[i])
radf$complement_base_gwas[i] <- ifelse(is.null(s1), NA, s1)
radf$complement_base_vcf[i] <- ifelse(is.null(s2), NA, s2)
radf$complement_base_vcf_ref[i] <- ifelse(is.null(s3), NA, s3)
}
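## A risk allele that matches neither ALT nor its strand complement must
## refer to the REF allele, so the sign of its effect size is reversed.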
radf$different <- radf$risk_allele_gwas != radf$risk_allele_vcf &
radf$complement_base_gwas != radf$risk_allele_vcf
radf$same_as_ref <- radf$risk_allele_gwas == radf$ref_allele_vcf |
radf$risk_allele_gwas == radf$complement_base_vcf_ref
radf$reverse_sign <- ifelse(radf$different == FALSE, 0, 1)
reverse_sign_SNPs <- rownames(radf)[which(radf$reverse_sign == 1)]
cat(">", length(reverse_sign_SNPs), "variants had opposite risk allele coding with the GWAS catalog.\n")
e$opposite_allele_coding <- length(reverse_sign_SNPs)
g$effect_size_final <- g$effect_size
reverse_ind <- which(g$variant_id %in% reverse_sign_SNPs)
g$effect_size_final[reverse_ind] <- -1 * g$effect_size_final[reverse_ind]
cat(">", length(reverse_ind), "effect sizes had their sign inverted. The new effect sizes are stored as effect_size_final.\n")
e$inverted_sign <- length(reverse_ind)
if (exclude_extreme_associations == TRUE) {
if (binary_outcome == TRUE) {
## |log(OR)| > log(5) corresponds to OR > 5 or OR < 1/5
extreme_variants <- unique(g$variant_id[which(abs(log(g$or_per_copy_number)) > log(5))])
g <- subset(g, !variant_id %in% extreme_variants)
v <- subset(v, !ID %in% extreme_variants)
if (length(extreme_variants) > 0) {
cat("> Dropped", length(extreme_variants), "variants with extreme ORs (>5 or <1/5).\n")
}
e$extreme_effectsize_variants_dropped <- length(extreme_variants)
unique_variants <- v$ID
drop_check(v)
}
else {
cat("> No variants dropped (exclude_extreme_associations only compatible with binary outcomes).\n")
e$extreme_effectsize_variants_dropped <- 0
}
}
d <- variant_data$gt
d <- d[, which(colnames(d) %in% unique_variants)]
## MAF filter
all_snps <- colnames(d)
mafs <- sapply(d[, all_snps],function(x) sum(x)/(2 * length(x)))
mafs <- ifelse(mafs > 0.5, 1 - mafs, mafs)
snp_maf <- data.frame(all_snps, mafs)
snp_keep <- subset(snp_maf, mafs >= maf_filter)
g <- subset(g, variant_id %in% snp_keep$all_snps)
v <- subset(v, ID %in% snp_keep$all_snps)
if (nrow(snp_keep) > 0) {
cat("> Dropped", nrow(snp_maf) - nrow(snp_keep), "variants with MAF below threshold.\n")
}
e$maf_filter_dropped <- nrow(snp_maf) - nrow(snp_keep)
unique_variants <- v$ID
drop_check(v)
d <- variant_data$gt
d <- d[, which(colnames(d) %in% unique_variants)]
## start pruning
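## Greedy pruning: each round flags variant pairs with squared correlation
## above pruning_threshold, keeps one variant per pair (lowest p-value or
## highest MAF, see pruning_filter) and repeats until no pair remains.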
LD_dat <- cor(d)
LD <- LD_dat
if (LDplot == TRUE) {
heatmap(LD)
}
pruning_done <- FALSE
pruned_set <- c()
ld_pairs_list <- list()
pruning_counter <- 0
while (pruning_done == FALSE) {
pruning_counter <- pruning_counter + 1
cat('> Pruning round',pruning_counter,'..\n')
LD <- cor(d)
LD2 <- round(LD^2, 2)
LD2 <- LD2 * lower.tri(LD2)
LDtab <- which(LD2 > pruning_threshold, arr.ind = TRUE)
if (nrow(LDtab) == 0) break
row_snps <- rownames(LD2)[LDtab[, 1]]
col_snps <- rownames(LD2)[LDtab[, 2]]
ld_pairs_list[[pruning_counter]] <- data.frame(row_snps, col_snps)
## Prune based on p-value, else based on MAF
if(pruning_filter == 'p'){
row_p <- col_p <- numeric(length(row_snps))
for (i in seq_along(row_snps)) {
row_p[i] <- min(g$pvalue[g$variant_id == row_snps[i]], na.rm = TRUE)
col_p[i] <- min(g$pvalue[g$variant_id == col_snps[i]], na.rm = TRUE)
}
p_df <- data.frame(row_p, col_p)
which_min <- unlist(apply(p_df, 1, which.min))
which_keep <- unique(ifelse(which_min == 1, row_snps,
col_snps))
}
else{
if(pruning_filter != 'maf'){cat('> Pruning filter not recognized; pruning based on MAF.\n')}
if(length(row_snps)>1){
row_mafs <- sapply(d[, row_snps], function(x) sum(x)/(2 * length(x)))
}
else{row_mafs <- sum(d[,row_snps])/(2*length(d[,row_snps]))}
row_mafs <- ifelse(row_mafs > 0.5, 1 - row_mafs, row_mafs)
if(length(col_snps)>1){
col_mafs <- sapply(d[, col_snps], function(x) sum(x)/(2 * length(x)))
}
else{col_mafs <- sum(d[,col_snps])/(2*length(d[,col_snps]))}
col_mafs <- ifelse(col_mafs > 0.5, 1 - col_mafs, col_mafs)
maf_df <- data.frame(row_mafs, col_mafs)
which_max <- unlist(apply(maf_df, 1, which.max))
which_keep <- unique(ifelse(which_max == 1, row_snps,
col_snps))
}
remove_SNPs <- setdiff(union(row_snps, col_snps), which_keep)
pruned_set <- c(pruned_set, remove_SNPs)
d[, remove_SNPs] <- NULL
v <- v[!v$ID %in% remove_SNPs, ]
g <- subset(g, !variant_id %in% remove_SNPs)
LD <- cor(d)
LD2 <- round(LD^2, 2)
LD2 <- LD2 * lower.tri(LD2)
LDtab <- which(LD2 > pruning_threshold, arr.ind = TRUE)
if (nrow(LDtab) == 0)
pruning_done <- TRUE
}
cat("> Dropped ", length(pruned_set), " variants in high LD (R^2>=",
pruning_threshold, ") with other variants.\n", sep = "")
e$high_LD_snps_pruned <- length(pruned_set)
drop_check(v)
# conditional analysis
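## Marginal GWAS estimates of variants in LD double-count shared signal;
## marg2con() converts them to conditional estimates using the sample LD
## of variants on the same chromosome within cond_window base pairs.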
if (conditional) {
pos <- v$POS
chr <- v$CHROM
g <- g[match(colnames(d), g$variant_id), ]
g2 <- g
d2 <- d
LD2 <- cor(d2)
for (i in 1:nrow(LD2)) {
for (j in 1:ncol(LD2)) {
chr_i <- chr[i]
chr_j <- chr[j]
if (chr_i != chr_j) {
LD2[i, j] <- 0
next
}
pos_i <- pos[i]
pos_j <- pos[j]
pos_dif <- abs(pos_i - pos_j)
if (pos_dif > cond_window) {
LD2[i, j] <- 0
}
else next
}
}
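## LD2 now holds correlations only for same-chromosome pairs within
## cond_window; zeroed entries are treated as independent.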
if(cond_stepwise == FALSE){
for (r in 1:nrow(g)) {
snp <- g$variant_id[r]
snp_cor <- LD2[r, ]
out <- sum(snp_cor != 0 & snp_cor != 1)
if (out == 0) {
if (ridge) {
sds <- sd(d[, r])
marg_beta <- c(g2$effect_size_final[r], 0)
covX <- diag(c(sds, 0))
cond_res <- marg2con(marginal_coefs = marg_beta,
covX = covX, N = cond_N, ridge = ridge, lambda = lambda,
binary = binary_outcome)
g$effect_size_final[r] <- cond_res$beta[1]
}
## without correlated neighbours there is nothing to condition on
next
}
adj <- which(snp_cor != 0 & snp_cor != 1)
ind <- c(r, adj)
snps <- g$variant_id[ind]
se <- ifelse(is.na(g2$standard_error[ind]), abs(g2$effect_size[ind]/qnorm(g2$pvalue[ind]/2)),
g2$standard_error[ind])
if (sum(is.na(se) | is.nan(se)) > 0) {
cat("> One or several standard errors was missing, conditional analysis was skipped for this estimate.\n")
next
}
marg_beta <- g2$effect_size_final[ind]
covX <- cov(d2)[ind, ind]
if (ridge == FALSE) {
cond_res <- marg2con(marginal_coefs = marg_beta,
covX = covX, N = cond_N, estimate_se = TRUE,
marginal_se = se, binary = binary_outcome)
if (sum(is.nan(cond_res$se)) > 0) {
cat("> One or several standard errors was missing, conditional analysis was skipped for this estimate.\n")
next
}
g$effect_size_final[r] <- cond_res$beta[1]
g$standard_error[r] <- cond_res$se[1]
g$pvalue[r] <- cond_res$p[1]
cat("> Marginal estimates, standard errors and p-values were converted to conditional based on LD.\n")
}
else {
cond_res <- marg2con(marginal_coefs = marg_beta,
covX = covX, N = cond_N, ridge = ridge, lambda = lambda,
binary = binary_outcome)
g$effect_size_final[r] <- cond_res$beta[1]
cat("> Marginal estimates were converted to conditional based on LD, and ridge regularization was applied.\n")
}
}
}
if(cond_stepwise){
prem_SNPs2 <- c()
## each time pick the lowest p-val SNP and condition on this
## repeat until no SNPs can be dropped after conditional analysis
conditioning_done <- FALSE
while(conditioning_done == FALSE){
g2 <- subset(g2, variant_id %in% g$variant_id)
d2 <- d2[, which(colnames(d2) %in% colnames(d))]
g2 <- g2[match(colnames(d2), g2$variant_id), ]
LD2 <- cor(d2)
for (r in 1:nrow(g2)) {
snp <- g2$variant_id[r]
snp_cor <- LD2[r, ]
out <- sum(snp_cor != 0 & snp_cor != 1)
if (out == 0) {
if (ridge) {
sds <- sd(d2[, r])
marg_beta <- c(g2$effect_size_final[r], 0)
covX <- diag(c(sds, 0))
cond_res <- marg2con(marginal_coefs = marg_beta,
covX = covX, N = cond_N, ridge = ridge, lambda = lambda,
binary = binary_outcome)
g$effect_size_final[r] <- cond_res$beta[1]
}
## without correlated neighbours there is nothing to condition on
next
}
adj <- which(snp_cor != 0 & snp_cor != 1)
ind <- c(r, adj)
snps <- g2$variant_id[ind]
sentinel <- g2$variant_id[adj][which.min(g2$pvalue[adj])]
if(length(sentinel)==0) next
ind2 <- ind
sentinel_ind <- which(g2$variant_id == sentinel[1])
ind <- c(r, sentinel_ind)
se <- ifelse(is.na(g2$standard_error[ind]), abs(g2$effect_size[ind]/qnorm(g2$pvalue[ind]/2)),
g2$standard_error[ind])
if (sum(is.na(se) | is.nan(se)) > 0) {
cat("> One or several standard errors was missing, conditional analysis was skipped for this estimate.\n")
next
}
marg_beta <- g2$effect_size_final[ind]
covX <- cov(d2[,ind])
if (ridge == FALSE) {
cond_res <- marg2con(marginal_coefs = marg_beta,
covX = covX, N = cond_N, estimate_se = TRUE,
marginal_se = se, binary = binary_outcome)
if (sum(is.nan(cond_res$se)) > 0) {
cat("> One or several standard errors was missing, conditional analysis was skipped for this estimate.\n")
next
}
g$effect_size_final[r] <- cond_res$beta[1]
g$standard_error[r] <- cond_res$se[1]
g$pvalue[r] <- cond_res$p[1]
}
else {
cond_res <- marg2con(marginal_coefs = marg_beta,
covX = covX, N = cond_N, ridge = ridge, lambda = lambda,
binary = binary_outcome)
g$effect_size_final[r] <- cond_res$beta[1]
cat("> Marginal estimates were converted to conditional based on LD, and ridge regularization was applied.\n")
}
}
prem_SNPs <- unique(g$variant_id[which(g$pvalue > pval_threshold)])
if (length(prem_SNPs) > 0) {
prem_SNPs2 <- c(prem_SNPs2, prem_SNPs)
g <- subset(g, pvalue <= pval_threshold)
d <- d[, -which(colnames(d) %in% prem_SNPs)]
v <- subset(v, !ID %in% prem_SNPs)
}
if (length(prem_SNPs) == 0){
conditioning_done <- TRUE
}
}
}
}
prem_SNPs <- unique(g$variant_id[which(g$pvalue > pval_threshold)])
if (length(prem_SNPs) > 0) {
g <- subset(g, pvalue <= pval_threshold)
d <- d[, -which(colnames(d) %in% prem_SNPs)]
v <- subset(v, !ID %in% prem_SNPs)
}
if (conditional && cond_stepwise) prem_SNPs <- c(prem_SNPs, prem_SNPs2)
cat("> Removed ", length(prem_SNPs), " variants by p-value thresholding (threshold=",
pval_threshold, ")\n", sep = "")
e$SNPs_removed_by_pval_thresholding <- length(prem_SNPs)
remaining_l <- length(unique(v$ID))
cat(">", remaining_l, "variants remaining.\n")
e$remaining_snps <- remaining_l
drop_check(v)
g <- g[match(colnames(d), g$variant_id), ]
effectsize <- as.matrix(g$effect_size_final)
prs <- as.matrix(d) %*% effectsize
cat("> Weighted polygenic score created using", length(effectsize), "SNPs.\n")
if (scale == TRUE) {
prs <- scale(prs)
cat("> Centered and standardized the score.\n")
}
e <- do.call("c", e)
res_list$process_log <- e
res_list$gwas_info <- g
res_list$variant_info <- v
res_list$allele_data <- d
res_list$risk_allele_df <- radf
res_list$sample_ids <- as.matrix(rownames(d))
res_list$prs <- data.frame(id = res_list$sample_ids, prs = prs)
rownames(res_list$prs) <- NULL
res_list$ld_matrix <- LD_dat
if (length(ld_pairs_list) > 0) {
res_list$ld_pairs <- do.call("rbind", ld_pairs_list)
}
cat("> Returned a list of results with the following dimensions:\n")
print(lapply(res_list, dim))
end_time <- Sys.time()
cat("> Creation of the PRS took", difftime(end_time, start_time,
units = "secs"), "seconds.\n")
if (flowchart == TRUE) {
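## e is the process log in insertion order: 1 retrieved, 2 low imputation
## quality, 3 zero variance, 4 missing from gwas_info, 5 missing effect
## size, 6 missing risk allele, 10 extreme effect sizes, 11 MAF filter,
## 12 LD-pruned, 13 removed by p-value thresholding.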
print(PRISMAstatement::flow_exclusions(incl_counts = c(e[1],
e[1] - e[2],
e[1] - sum(e[2:3]),
e[1] - sum(e[2:4]),
e[1] - sum(e[2:6]),
e[1] - sum(e[2:6]) - e[10],
e[1] - sum(e[2:6]) - sum(e[10:11]),
e[1] - sum(e[2:6]) - sum(e[10:12]),
e[1] - sum(e[2:6]) - sum(e[10:13])),
total_label = "Total variants retrieved",
incl_labels = c("High imputation quality variants",
"Variants with non-zero variance", "Variants reported in GWAS catalog",
"Variants with complete information", "Variants with non-extreme effect sizes",
"Variants with sufficiently high MAF",
"Variants sufficiently independent", "Variants used in the weighted score"),
excl_labels = c(paste0("Low imputation quality (R^2<",
imp_threshold, ")"), "Variants with zero variance in the data",
"Variants not found in GWAS catalog information",
"Missing effect size or risk allele", "Variants with extreme effect sizes (99th percentile)",
paste0("Variants with MAF below the filter (",maf_filter,")"),
paste0("Variants in high LD (>",pruning_threshold,") with other included variants"),
paste0("Variants with p-value above threshold (",
pval_threshold, ")"))))
}
invisible(res_list)
}