library(tidyverse)
#### GET ASSOCIATED TRAITS FROM GWAS CATALOG (USING SNPS IN LD) ####
if(!exists("ebicat37")) data(ebicat37, package = "gwascat")
# to use makeCurrent... need to load package before, requires data set 'si.hs.38'
library(gwascat)
if(!exists("my_ebicat37")) my_ebicat37 = gwascat::makeCurrentGwascat(genome='GRCh37')
get_associatedTraits <- function(rsid, chr, pos, LD=0.05, distance=100000, P=5e-8, gwascatdata=NULL){
require(gwascat)
### get GWAS catalog data
if(is.null(gwascatdata)){
#if(!exists("ebicat37")) data(ebicat37)
#if(!exists("my_ebicat37")) my_ebicat37 = makeCurrentGwascat(genome='GRCh37')
gwascatdata = makeCurrentGwascat(genome='GRCh37')
}
as_tibble(gwascatdata) %>%
transmute(snp = SNPS,
snp_with_riskAllele = STRONGEST.SNP.RISK.ALLELE,
chrm = CHR_ID,
posh19 = start, #CHR_POS -> GRCh38
LD_R2 = NA_real_,
LD_alleles = NA_character_,
trait = DISEASE.TRAIT,
#trait = MAPPED_TRAIT,
p = P.VALUE,
adjusted_p = NA_real_,
# effect = OR.or.BETA, # useless, no direction ...
effect = X95..CI..TEXT.,
gene = MAPPED_GENE,
url = LINK) -> gwascatdata_nice
### if SNP itself associated, get info
if(rsid %in% gwascatdata_nice$snp){
gwascatdata_nice %>%
filter(snp==rsid,
p<=P) %>%
arrange(trait, p)-> my_data
# keep only strongest association if multiple ones reported for same trait
my_data %>%
filter(!duplicated(trait)) -> my_data
}
### also test SNPs nearby
gwascatdata_nice %>%
filter(snp != rsid,
chrm == chr,
posh19 <= pos+distance,
posh19 >= pos-distance,
p<=P) -> snps_nearby
# get their LD with our main SNP
require(LDlinkR)
get_R2 <- function (snp1, snp2) {
d = capture.output(try(LDpair(snp1,
snp2,
pop="EUR",
token=Sys.getenv("LDLINK_TOKEN"),
output="text"), silent = TRUE))
# Tidy up data_out
r2 = strsplit(d[22], "\\s+")[[1]]
r2 = as.numeric(r2[3])
return(r2)
}
get_allele <- function (snp1, snp2) {
d = capture.output(try(LDpair(snp1,
snp2,
pop="EUR",
token=Sys.getenv("LDLINK_TOKEN"),
output="text"), silent = TRUE))
# Tidy up data_out
alignment = strsplit(d[27], " ")[[1]]
alignment = paste0(alignment[1], "/", alignment[6])
return(alignment)
}
snps_nearby %>%
filter(!duplicated(snp)) -> snps_nearby_unique
snps_nearby_unique$LD_R2 = apply(snps_nearby_unique, 1, function(x) suppressWarnings(get_R2(x[1], rsid)))
snps_nearby %>%
mutate(LD_R2 = snps_nearby_unique$LD_R2[match(snp, snps_nearby_unique$snp)]) -> snps_nearby
# keep only the ones in LD with significant association
snps_nearby %>%
filter(LD_R2>LD) %>%
mutate(adjusted_p = 2*pnorm(sqrt(LD_R2) * qnorm(0.5*p))) %>%
filter(adjusted_p<=P) %>%
arrange(trait, adjusted_p) -> snps_nearby
# if any trait(s) in common with SNP itself -> remove
if(exists("my_data")){
snps_nearby %>%
filter(!trait %in% my_data$trait) -> snps_nearby
}
# keep only strongest association if multiple ones reported for same trait
snps_nearby %>%
filter(!duplicated(trait)) -> snps_nearby
# add info about alleles in LD
snps_nearby %>%
filter(!duplicated(snp)) -> snps_nearby_unique
snps_nearby_unique$LD_alleles = apply(snps_nearby_unique, 1, function(x) suppressWarnings(get_allele(x[1], rsid)))
snps_nearby %>%
mutate(LD_alleles = snps_nearby_unique$LD_alleles[match(snp, snps_nearby_unique$snp)]) -> snps_nearby
if(exists("my_data")) bind_rows(my_data, snps_nearby) -> snps_nearby
snps_nearby %>%
mutate(snp=snp_with_riskAllele,
snp_with_riskAllele=NULL) -> snps_nearby
return(snps_nearby)
}
# # associated with T2D, multiple studies & blood pressure
# rsid="rs34872471"
# chr=10
# pos=114754071
# get_associatedTraits(rsid, chr, pos, gwascatdata = my_ebicat37) -> A
# # not in GWAS catalog, multiple SNPs nearby associated with several traits (at 500kb)
# rsid="rs7630554"
# chr=3
# pos=185526062
# get_associatedTraits(rsid, chr, pos, gwascatdata = my_ebicat37) -> B
#### GET GENES NAMES USING ENSEMBL ####
# not used actually, rather use annovar
# my_biomart = biomaRt::useMart(biomart="ENSEMBL_MART_ENSEMBL",
# host="grch37.ensembl.org", path="/biomart/martservice",
# dataset="hsapiens_gene_ensembl")
#
#
# get_gene <- function(chr, pos, biomart=NULL){
# require(biomaRt)
#
# ### get ensembl data
# if(is.null(biomart)){
# biomart = biomaRt::useMart(biomart="ENSEMBL_MART_ENSEMBL",
# host="grch37.ensembl.org", path="/biomart/martservice",
# dataset="hsapiens_gene_ensembl")
# }
#
# # get gene(s) at this exact position
# all.genes <- biomaRt::getBM(
# attributes=c("ensembl_gene_id","hgnc_symbol","chromosome_name","start_position","end_position"),
# filters=c("chromosome_name", "start", "end"),
# values=list(chromosome=chr, start=pos, end=pos),
# mart=biomart)
#
# all.genes %>%
# filter(!hgnc_symbol=="") -> all.genes
#
# if(nrow(all.genes)==1){
# my_gene = all.genes
#
# ## what if several ones?
# } else if(nrow(all.genes)>=1){
# # get the closest?
# all.genes %>%
# mutate(distance=pmin(abs(start_position-pos), abs(end_position-pos))) %>%
# arrange(distance)-> all.genes
# my_gene = all.genes[1,]
# ## what if no gene exactly here?
# } else if(nrow(all.genes)==0){
# # look at the ones in a 500kb windows
# all.genes <- biomaRt::getBM(
# attributes=c("ensembl_gene_id","hgnc_symbol","chromosome_name","start_position","end_position"),
# filters=c("chromosome_name", "start", "end"),
# values=list(chromosome=chr, start=pos-500000, end=pos+500000),
# mart=biomart)
# # get the closest?
# all.genes %>%
# filter(!hgnc_symbol=="") %>%
# mutate(distance=pmin(abs(start_position-pos), abs(end_position-pos))) %>%
# arrange(distance)-> all.genes
# # get the most relevant one(s)
# all.genes %>%
# filter(!hgnc_symbol=="") %>%
# mutate(distance=pmin(abs(start_position-pos), abs(end_position-pos))) %>%
# arrange(distance)-> all.genes
# my_gene = all.genes[1,]
# }
#
# return(my_gene$hgnc_symbol)
#
# }
#### GET GENES NAMES USING ANNOVAR ####
# get_geneInfo returns c(function, gene, distance)
get_geneInfo <- function(chr, pos, ref="0", alt="0"){
init.dir = getwd()
# Set needed directory (annovar needs to be installed somewhere)
annovar.dir <- "~/bin/annovar"
if(!dir.exists(annovar.dir)) suppressMessages(BioInstaller::install.bioinfo('annovar', annovar.dir))
setwd(annovar.dir)
if(!file.exists("humandb/hg19_refGene.txt")) suppressMessages(system("perl annotate_variation.pl -buildver hg19 -downdb -webfrom annovar refGene humandb/"))
# since everyhting is done in the annovar/ folder
# creates an unique ID to be able to perform simultaneous analysis
# without problems of "same name" files
ID = sample(1:100000, 1)
write.table(data.frame(chr, pos, pos, ref, alt), paste0("mydata_", ID), sep="\t", row.names=F, col.names=F, quote=F)
#The gene-based annotation can be issued by the following command (by default, --geneanno -dbtype refGene is assumed):
# [kaiwang@biocluster ~/]$ annotate_variation.pl -out ex1 -build hg19 example/ex1.avinput humandb/
suppressMessages(system(paste0("perl annotate_variation.pl -out myres_", ID,
" -build hg19 mydata_", ID, " humandb/"),
intern = F, ignore.stdout=T, ignore.stderr=T))
# creates myres.variant_function, myres.exonic_variant_function and myres.log / remove them afterwards
data.table::fread(paste0("myres_", ID, ".variant_function"), data.table = F) %>%
dplyr::select(c(1:2)) %>%
setNames(c("Function", "Gene")) -> res
# if intronic/exonic ... only gene name reported, ok
if(res$Function %in% c("intronic", "exonic", "ncRNA_intronic")){
res %>%
mutate(Distance=0) -> res
# if UTR / splicing, need to clean gene name
} else if(res$Function %in% c("UTR3", "UTR5", "splicing")){
my_gene = strsplit(res$Gene, "(", fixed=T)[[1]][1]
res %>%
mutate(Gene=my_gene,
Distance=0) -> res
# if downstream / upstream
} else if(res$Function %in% c("downstream", "upstream")){
my_gene = strsplit(res$Gene, "(", fixed=T)[[1]][1]
my_distance = gsub('.{1}$', '', strsplit(res$Gene, "dist=", fixed=T)[[1]][2])
res %>%
mutate(Gene=my_gene,
Distance=my_distance) -> res
}else if(res$Function == "intergenic"){
# returns both (can be more than 2?)
my_gene1 = strsplit(res$Gene, "(", fixed=T)[[1]][1]
my_distance1 = strsplit(gsub('.{1}$', '', strsplit(res$Gene, "dist=", fixed=T)[[1]][2]), ")", fixed=T)[[1]][1]
part2 = strsplit(res$Gene, ",", fixed=T)[[1]][2]
my_gene2 = strsplit(part2, "(", fixed=T)[[1]][1]
my_distance2 = strsplit(gsub('.{1}$', '', strsplit(part2, "dist=", fixed=T)[[1]][2]), ")", fixed=T)[[1]][1]
res %>%
mutate(Gene=paste(my_gene1, my_gene2, sep="/"),
Distance=paste(my_distance1, my_distance2, sep="/")) -> res
}
system(paste0("rm myres_", ID, ".* mydata_", ID))
setwd(init.dir)
return(res)
}
# # exonic
# chr = 19
# pos = 45411941
# snp= "rs429358"
# ref = "T"
# alt= "C"
# get_geneInfo(chr, pos, alt, ref)
#
# # intergenic
# chr = 6
# pos = 98322872
# snp= "rs4580876"
# ref = "G"
# alt= "A"
# get_geneInfo(chr, pos, alt, ref)
#
# # UTR3
# chr = 16
# pos = 4013467
# snp= "rs2531995"
# ref = "C"
# alt= "T"
# get_geneInfo(chr, pos, alt, ref)
#
# # UTR5
# chr = 1
# pos = 948921
# snp = "rs15842"
# alt = "T"
# red = "C"
# get_geneInfo(chr, pos, alt, ref)
#
# # downstream
# chr = 7
# pos = 75162278
# snp = "rs62477737"
# ref = "G"
# alt = "A"
# get_geneInfo(chr, pos, alt, ref)
#
# # intronic
# chr = 7
# pos = 75094329
# snp = "rs113160991"
# alt = "A"
# get_geneInfo(chr, pos, alt)
#
# # splicing
# chr = 1
# pos = 5935162
# snp = "rs1287637"
# ref = "A"
# alt = "T"
# get_geneInfo(chr, pos, alt, ref)
#
#
# #exonic : ok
# #splicing : ok
# #ncRNA : ?? variant overlaps a transcript without coding annotation in the gene definition (see Notes below for more explanation) non_coding_transcript_variant (SO:0001619)
# # ncRNA_intronic
# #UTR5 : ok
# #UTR3 : ok
# #intronic : ok
# #upstream : potentially two genes?
# #downstream : potentially two genes?
# # downstream,upstream combined?
# #intergenic : more that two genes?
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.