# %%%%%%%%%%%%%%%%% #
####### Colocalization #######
# %%%%%%%%%%%%%%%%% #
# Jansen et al. 2017: https://eqtl.onderzoek.io/index.php?page=gene_cis_details&gene=BST1
#-------------------------------------------------------
#-------------------------------------------------------
# ECHOLOCATOR DOCUMENTATION
# The input for coloc is a merged data.frame with the summary stats from both the GWAS and QTL, with one unique SNP per row. Those columns should be named as follows:
#
# ## Data.frame
# # Shared columns
# SNP: unique identifier for SNP that uses the same naming system for both the GWAS and QTL (e.g. rsid (preferred), or chr:position).
# # GWAS columns
# P: Nominal P-values from the GWAS. Technically you should be able to supply corrected P-values as well (e.g. FDR) since coloc will turn them into
# Effect: Effect size (beta, or odds ratio) of each SNP in the GWAS
# StdErr: The standard error of each SNP in the GWAS (it should be non-squared since I square it in the coloc function).
# MAF: Minor allele frequency, preferably from the study itself but you can use a reference panel (e.g. 1000 Genomes) if necessary.
# # QTL columns
# QTL.P: Nominal P-values from the QTL.
# QTL.Effect: Effect size (beta, or odds ratio) of each SNP in the QTL
# QTL.StdErr: The standard error of each SNP in the QTL (it should be non-squared since I square it in the coloc function).
# QTL.MAF: Minor allele frequency, preferably from the study itself but you can use a reference panel (e.g. 1000 Genomes) if necessary, or even the MAF from the GWAS.
#
# ## Single-value parameters
# sample_size: In addition to this data.frame, you need to supply the sample_size for both the GWAS and QTL, respectively. Rather than one value per SNP, these will be just two values (number of subjects in the GWAS and number of subjects in the QTL). Technically, sample_size is an optional parameter for coloc but I find it actually can have a large impact on the results, so supplying an accurate number for this is still important.
# proportion_cases: The proportion of subjects that were cases (e.g. PD). This is only necessary for GWAS, since QTL doesn't really have a case-control design.
#-------------------------------------------------------
#-------------------------------------------------------
getmode <- function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
COLOC.construct_dataset <- function(dat,
sample_size=NULL,
proportion_cases=NA, # Doesn't allow actual 0, so use smallest number R allows
MAF=NA,
type="cc"){
# Sample size
sample_size <- max(dat$N)
# Proportion cases
if(is.na(proportion_cases)){
if("proportion_cases" %in% colnames(dat)){
messager("++ Extracting Proportion of Cases...")
proportion_cases <- getmode(dat$proportion_cases)
}
}
# MAF
if(length(MAF)==1){
messager("++ Extracting MAF...")
if(is.na(MAF) & "MAF" %in% colnames(dat)){
MAF <- dat$MAF
}
}
# List form
dataset <- list(pvalues = dat$P,
beta = dat$Effect,
varbeta = dat$StdErr^2, # MUST be squared
snp = subset_DT$SNP,
N = sample_size, # [optional]
s = proportion_cases, # use overall proportions
MAF = MAF, # [required]
type = type)
return(dataset)
}
COLOC <- function(data1,
data2,
data1_type="cc",# case-control
data2_type="quant",
shared_MAF=FALSE,
data1_proportion_cases=NA,
data2_proportion_cases=5e-324,
data1_MAF="MAF",
data2_MAF=NA,
credset_thresh=0.8,
save_results=TRUE,
results_path=NULL,
show_plot=TRUE,
title1=NULL,
subtitle1=NULL,
title2=NULL,
subtitle2=NULL){
messager("******** Step 5: COLOCALIZE ********")
# The Approximate Bayes Factor colocalisation analysis described in the next section
## essentially works by fine mapping each trait under a single causal variant assumption
##and then integrating over those two posterior distributions to calculate probabilities that
## those variants are shared.
# https://cran.r-project.org/web/packages/coloc/vignettes/vignette.html
# data1 <- data.table::fread(dataset1_path, stringsAsFactors = FALSE, sep="\t") %>% data.frame()
# data2 <- data.table::fread(dataset2_path, stringsAsFactors = FALSE, sep="\t") %>% data.frame()
common_SNPs <- intersect(data1$SNP, data2$SNP)
data1 <- subset(data1, SNP %in% common_SNPs) %>% group_by(SNP) %>% slice(1) %>% data.frame()
data2 <- subset(data2, SNP %in% common_SNPs) %>% group_by(SNP) %>% slice(1) %>% data.frame()
dataset1 <- COLOC.construct_dataset(subset_DT = data1,
sample_size = data2_sample_size,
proportion_cases = data1_proportion_cases,
type = data1_type,
MAF = data1_MAF)
dataset2 <- COLOC.construct_dataset(subset_DT = data2,
sample_size = data2_sample_size,
proportion_cases = data2_proportion_cases,
type = data2_type,
MAF = data2_MAF)
if(shared_MAF==1){
data2$MAF <- data1$MAF
} else if(shared_MAF==2){
data1$MAF <- data2$MAF
}
## NOTES: MESA and Fairfax: No sample size (SNP-level), proportion of cases, or freq/MAF info available?
# messager("\n\n")
coloc.res <- coloc::coloc.abf(dataset1 = data1,
dataset2 = data2)
# MAF = dataset1$MAF)
hypothesis_key <- setNames(
c("Neither trait has a genetic association in the region.",
"Only trait 1 has a genetic association in the region.",
"Only trait 2 has a genetic association in the region.",
"Both traits are associated, but with different causal variants (one in each dataset).",
"Both traits are associated and share a single causal variant.") ,
c("PP.H0.abf","PP.H1.abf","PP.H2.abf","PP.H3.abf","PP.H4.abf"))
# Report hypothess results
messager("\n Hypothesis Results @ credset_thresh =",credset_thresh,":")
true_hyp <-""
for(h in names(hypothesis_key)){
if(coloc.res$summary[h]>=credset_thresh){
hyp <- hypothesis_key[h]
messager(" ",h,"== TRUE: **",hyp )
true_hyp <- paste0(names(hyp),": ", hyp)
} else{
messager(" ",h,"== FALSE: ")
}
}
coloc_DT <- coloc.res$results
# Find the causal SNP that coloc.abf identified in each dataset via finemap.abf
# DT1
causal_DT1 <- coloc_DT %>%dplyr::arrange(desc(lABF.df1))
causal_DT1 <- causal_DT1$snp[1]
# DT2
causal_DT2 <- coloc_DT %>%dplyr::arrange(desc(lABF.df2))
causal_DT2 <- causal_DT2$snp[1]
# Process results
coloc_DT$Colocalized <- ifelse(coloc_DT$SNP.PP.H4 >= credset_thresh, T, F)
colocalized_snps <- subset(coloc_DT, Colocalized==TRUE)$snp# subset(coloc_DT, Colocalized==1)$SNP
coloc_datasets <- coloc_plot_data(coloc.res, data1, data2)
subtitle2 <- paste0("Colocalized SNPs: ", paste(colocalized_snps,sep=", "))
if((coloc.res$summary["PP.H3.abf"] + coloc.res$summary["PP.H4.abf"] >= credset_thresh) &
(coloc.res$summary["PP.H4.abf"]/coloc.res$summary["PP.H3.abf"] >= 2)){
# "We called the signals colocalized when (coloc H3+H4 ≥ 0.8 and H4∕H3 ≥ 2)" -Yi et al. (2019)
report <- paste("Datasets colocalized")
} else {
report <- paste("Datasets NOT colocalized")
}
messager("\n++",report,"at PP.H3 + PP.H4 >=",credset_thresh," and PP.H3 / PP.H4 >= 2.","\n")
# Plot
if(show_plot){
# title1 <- paste(strsplit(dataset1_path,"/")[[1]][4],strsplit(dataset1_path,"/")[[1]][3])
# title2 <- paste(strsplit(dataset2_path,"/")[[1]][4],strsplit(dataset2_path,"/")[[1]][3])
COLOC.plot(coloc_DT1 = coloc_datasets$coloc_DT1,
coloc_DT2 = coloc_datasets$coloc_DT2,
title1 = title1,
subtitle1 = subtitle1,
title2 = title2,
subtitle2 = subtitle2,
# SNP_list = c("rs34637584","rs76904798","rs117073808"),
alt_color_SNPs = colocalized_snps,
show_plot = TRUE)
}
# Save
if(save_results){
coloc_path <- file.path(dirname(dataset1_path),"COLOC")
dir.create(coloc_path, recursive = TRUE, showWarnings = FALSE)
data.table::fwrite(coloc_DT, file.path(coloc_path,"COLOC_results.txt"), sep="\t")
data.table::fwrite(coloc_DT, file.path(results_path, "COLOC/COLOC_raw.txt"), sep="\t")
}
return(coloc_DT)
}
COLOC.plot_data <- function(coloc.res,
data1,
data2){
coloc_DT <- coloc.res$results
# Merge Dataset 1
coloc_DT1 <- coloc_DT %>% dplyr::select(SNP="snp",
V = "V.df1",
Z= "z.df1",
lABF = "lABF.df1",
H4.Probability = "SNP.PP.H4")
coloc_DT1 <- data.table:::merge.data.table(data1, coloc_DT1, on = "SNP", all = FALSE)
# Merge Dataset 2
coloc_DT2 <- coloc_DT %>% dplyr::select(SNP="snp",
V = "V.df2",
Z= "z.df2",
lABF = "lABF.df2",
H4.Probability = "SNP.PP.H4")
coloc_DT2 <- data.table:::merge.data.table(data2, coloc_DT2, on = "SNP", all = FALSE)
return(list(coloc_DT1=coloc_DT1, coloc_DT2=coloc_DT2))
}
COLOC.plot <- function(locus,
coloc_DT1,
coloc_DT2,
title1,
title2,
subtitle1,
subtitle2,
SNP_list=c(),
alt_color_SNPs=c(),
show_plot=TRUE){
mp1 <- manhattan_plot(subset_DT = coloc_DT1,
locus = locus,
SNP_list = SNP_list,
alt_color_SNPs = alt_color_SNPs,
title = title1,
subtitle = subtitle1,
show_plot = FALSE)
mp2 <- manhattan_plot(subset_DT = coloc_DT2,
locus = locus,
SNP_list = SNP_list,
alt_color_SNPs = alt_color_SNPs,
title = title2,
subtitle = subtitle2,
show_plot = FALSE)
cp <- patchwork::wrap_plots(mp1, mp2, ncol = 1)
if(show_plot){print(cp)}else{return(cp)}
}
COLOC.report_summary <- function(coloc.res, credset_thresh=.8){
# MAF = dataset1$MAF)
hypothesis_key <- setNames(
c("Neither trait has a genetic association in the region.",
"Only trait 1 has a genetic association in the region.",
"Only trait 2 has a genetic association in the region.",
"Both traits are associated, but with different causal variants (one in each dataset).",
"Both traits are associated and share a single causal variant.") ,
c("PP.H0.abf","PP.H1.abf","PP.H2.abf","PP.H3.abf","PP.H4.abf"))
# Report hypothess results
messager("Hypothesis Results @ credset_thresh =",credset_thresh,":")
true_hyp <-""
for(h in names(hypothesis_key)){
if(!is.na(coloc.res$summary[h])){
if(coloc.res$summary[h]>=credset_thresh){
hyp <- hypothesis_key[h]
messager(" ",h,"== TRUE: **",hyp )
true_hyp <- paste0(names(hyp),": ", hyp)
}
} else{
messager(" ",h,"== FALSE: ")
}
}
# Save raw results
coloc_DT <- coloc.res$results
# Process results
coloc_DT$Colocalized <- ifelse(coloc_DT$SNP.PP.H4 >= credset_thresh, T, F)
colocalized_snps <- subset(coloc_DT, Colocalized==TRUE)$snp# subset(coloc_DT, Colocalized==1)$SNP
subtitle2 <- paste0("Colocalized SNPs: ", paste(colocalized_snps,sep=", "))
if(!is.na(coloc.res$summary)["PP.H4.abf"] ){
if((coloc.res$summary["PP.H3.abf"] + coloc.res$summary["PP.H4.abf"] >= credset_thresh) &
(coloc.res$summary["PP.H4.abf"]/coloc.res$summary["PP.H3.abf"] >= 2)){
# "We called the signals colocalized when (coloc H3+H4 ≥ 0.8 and H4∕H3 ≥ 2)" -Yi et al. (2019)
report <- paste("Datasets colocalized")
} else {report <- paste("Datasets NOT colocalized") }
} else { report <- paste("Datasets NOT colocalized")}
messager("+ COLOC::",report,"at: PP.H3 + PP.H4 >=",credset_thresh," and PP.H3 / PP.H4 >= 2.")
return(coloc_DT)
}
# COLOC.plot_results <- function(coloc_DT){
# # Find the causal SNP that coloc.abf identified in each dataset via finemap.abf
# # DT1
# causal_DT1 <- coloc_DT %>%dplyr::arrange(desc(lABF.df1))
# causal_DT1 <- causal_DT1$snp[1]
# # DT2
# causal_DT2 <- coloc_DT %>%dplyr::arrange(desc(lABF.df2))
# causal_DT2 <- causal_DT2$snp[1]
# coloc_datasets <- coloc_plot_data(coloc.res, data1, data2)
# # Plot
# title1 <- paste(locus,":",strsplit(dataset1_path,"/")[[1]][4],strsplit(dataset1_path,"/")[[1]][3])
# title2 <- paste(locus,":",strsplit(dataset2_path,"/")[[1]][4],strsplit(dataset2_path,"/")[[1]][3])
#
# COLOC.plot(coloc_DT1 = coloc_datasets$coloc_DT1,
# coloc_DT2 = coloc_datasets$coloc_DT2,
# title1 = title1,
# subtitle1 = report,
# title2 = title2,
# subtitle2 = subtitle2,
# SNP_list = c("rs34637584","rs76904798","rs117073808"),
# alt_color_SNPs = colocalized_snps,
# show_plot = TRUE)
# }
COLOC.PP4_plot <- function(COLOC_DT=NULL, coloc.results_path=NULL, credset_thresh=.8){
if(is.null(COLOC_DT)){
if(is.null(coloc.results_path)){coloc.results_path <- "./Data/GWAS/Nalls23andMe_2019/_genome_wide/COLOC/COLOC_results_noFlip-gwasEffect.txt"}
COLOC_DT <- data.table::fread(coloc.results_path)
}
COLOC_DT$coloc <- (COLOC_DT$PP.H3.abf + COLOC_DT$PP.H4.abf >= credset_thresh) & (COLOC_DT$PP.H4.abf/COLOC_DT$PP.H3.abf >= 2)
COLOC_DT <- COLOC_DT %>% dplyr::rename(QTL.Dataset=Dataset2)
cp <- ggplot(subset(COLOC_DT, coloc==TRUE), aes(x=QTL.Dataset, y=PP.H4.abf, fill=QTL.Dataset)) +
facet_grid(~Locus) +
geom_col(show.legend = FALSE) +
coord_flip() +
theme_bw() +
scale_y_continuous(limits = c(0,1), breaks = c(0, 1)) +
theme(rect = element_rect(fill = "transparent"),
panel.background = element_rect(fill = "transparent"))
print(cp)
ggsave(file.path(dirname(coloc.results_path),"COLOC_multiQTL.png"),
plot = cp, dpi=400, width = 14, height=5, bg="transparent")
}
COLOC.iterate_QTL <- function(GTEx_version="GTEx_V7",
dataset.gwas.name = "./Data/GWAS/Nalls23andMe_2019"){
FM_all <- merge_finemapping_results(minimum_support = 0,
include_leadSNPs = TRUE,
dataset = dataset.gwas.name,
xlsx_path = FALSE)
QTL_files <- list.files(path = "./Data/QTL", pattern = "*.finemap.txt.gz", recursive = TRUE, full.names = TRUE)
QTL_files <- grep(paste(c("GTEx","Cardiogenics","MESA","psychENCODE"),collapse="|"), QTL_files, value = TRUE)
# QTL_files <- grep(paste(c("Fairfax"),collapse="|"), QTL_files, value = TRUE)
# qtl_file = QTL_files[28] # Need allele/MAF info: 1:4 (Brain_xQTL_Serve), 5:6 (Cardiogenics), 7:10 (Fairfax), 28:31 (psychENCODE) ##### (GTEx and MESA are good (tho MESA needs sample size))
COLOC_DT <- lapply(QTL_files, function(qtl_file){
dataset.qtl.name <- gsub(".finemap.txt.gz","",basename(qtl_file))
messager("")
message("mergeQTL:: ", dataset.qtl.name)
FM_merge <- mergeQTL.merge_handler(FM_all = FM_all, qtl_file = qtl_file)
subset(FM_merge, SNP %in% c("rs7294619","rs76904798","rs11175620")) %>% echodata::createDT()
## WARNING: DON'T try to allele flip. Coloc does this for you.
# FM_merge <- mergeQTL.flip_alleles(FM_merge) # NOTE!: Allele flipping makes a BIG difference.
## WARNING: Probably not valid to apply PP in this way since the model isn't intended to handle this.
## Instead, use something like PAINTOR that takes SNP-level PPs.
## Create measures adjusted by Posterior Probabilities from fine-mapping
# FM_merge$mean.PP <- rowMeans(subset(FM_merge, select=grep(".PP",colnames(FM_merge))))
# FM_merge$Adjusted.Effect <- FM_merge$Effect * FM_merge$mean.PP
coloc_dt <- lapply(unique(FM_merge$Gene), function(locus){
messager("+ COLOC::",locus)
FM_gene <- subset(FM_merge, Locus==locus)
FM_gene <- FM_gene[!is.na(FM_gene$QTL.Effect),]
default_results <- data.table::data.table(Locus=locus,
Dataset1=dataset.gwas.name,
Dataset2=dataset.qtl.name,
nSNPs_locus=nrow(FM_merge),
nSNPs_overlap=nrow(FM_gene),
PP.H0.abf=NA,
PP.H1.abf=NA,
PP.H2.abf=NA,
PP.H3.abf=NA,
PP.H4.abf=NA)
if(nrow(FM_gene)>0){
# GWAS
dataset.gwas <- list(pvalues = FM_gene$P,
beta = FM_gene$Effect,
varbeta = FM_gene$StdErr^2, # MUST be squared
snp = FM_gene$SNP,
N = max(FM_gene$N_cases, na.rm = TRUE) + max(FM_gene$N_controls, na.rm = TRUE), # [optional]
s = getmode(FM_gene$proportion_cases), # use overall proportions
MAF = FM_gene$MAF, # [required]
type = "cc")
# QTL
if(all(is.na(FM_gene$QTL.MAF))){FM_gene$QTL.MAF <- FM_gene$MAF; messager("+ COLOC:: No QTL.MAF given. Using GWAS.MAF instead.")}
FM_gene$QTL.MAF <- FM_gene$MAF ##############111111111111111
dataset.qtl <- list(pvalues = FM_gene$QTL.P,
beta = FM_gene$QTL.Effect,
varbeta = FM_gene$QTL.StdErr^2, # MUST be squared
snp = FM_gene$SNP,
N = max(FM_gene$QTL.SampleSize), # [optional]
# s = NA, # Not used for quant studies
#sdY = ,# [optional] for a quantitative trait, the population standard deviation of the trait. if not given, it can be estimated from the vectors of varbeta and MAF
MAF = FM_gene$QTL.MAF, # [required]
type = "quant")
# If regression coefficients and variances are available, it calculates Bayes factors for association at each SNP.
## If only p values are available, it uses an approximation that depends on the SNP's MAF and ignores any uncertainty
## in imputation. Regression coefficients should be used if available.
coloc.res <- coloc::coloc.abf(dataset1 = dataset.gwas,
dataset2 = dataset.qtl)
COLOC.report_summary(coloc.res, credset_thresh = .8)
# dat <- data.table::data.table(coloc.res$results)
dat <- data.table::data.table(t(coloc.res$summary))
dat <- cbind(Locus=locus,
Dataset1=dataset.gwas.name,
Dataset2=dataset.qtl.name,
nSNPs_locus=nrow(FM_merge),
dat)
dat <- dat %>% dplyr::rename(nSNPs_overlap=nsnps)
return(dat)
} else{
messager("COLOC:: No overlapping SNPs.")
return(default_results)
}
}) %>% data.table::rbindlist(fill=TRUE)
return(coloc_dt)
}) %>% data.table::rbindlist(fill=TRUE)
# Save
if(save_results){
coloc_path <- file.path(dataset.gwas.name,"_genome_wide","COLOC")
dir.create(coloc_path, recursive = TRUE, showWarnings = FALSE)
data.table::fwrite(COLOC_DT, file.path(coloc_path,"COLOC_results_noFlip-gwasEffect.txt"), sep="\t")
}
COLOC.PP4_plot(COLOC_DT, credset_thresh = .95)
return(COLOC_DT)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.