explore/rs4575098/rs4575098.R

# rs4575098,
# 0.98 LD with 	rs11585858
#   chrom      hg19      hg38       rsid
# 1     1 161156033 161186243 rs11585858
# allegedly associated with  PPOX,APOA2,ADAMTS4,B4GALT3,USP21,NDUFS2,TOMM40
# 1:161185602 (GRCh38)
# 1:161155392 (GRCh37)
#
# not reported by posthuma
# from williams:
#   ADAMTS4 rs4575098; ref. 31
#   Combined UK Biobank, ADSP, IGAP, PGC–ALZ and deCODE
#   Extracellular matrix metalloproteinase (aggrecanase-1)
#
# Jansen, I. E. et al. Genome-wide meta-analysis identifies new loci and
# functional pathways influencing Alzheimer’s disease risk. Nat. Genet. 51,
# 404–413 (2019).  29 loci in total, implicating 215 potential causative genes
# here is our tag snp
#
#          region                     case-control status(phase 1)        AD-by-proxy (phase 2)                 overall (phase3)
#   locus    chr  loc         gene          snp     p                             SNP    p               A1 A2 MAF    Z       p
#    1       chr1 161155392  ADAMTS4      rs457098  1.57e-04                  rs457098   6.88e-08         A  G 0.240 6.36  2.05E-10
#
# Here, we performed a large genome-wide association study of clinically diagnosed AD and
# AD-by-proxy (71,880 cases, 383,378 controls). AD-by- proxy, based on parental diagnoses, showed
# strong genetic correlation with AD (rg=0.81). Meta- analysis identified 29 risk loci, implicating
# 215 potential causative genes. Associated genes are strongly expressed in immune-related tissues
# and cell types (spleen, liver and microglia). Gene-set analyses indicate biological mechanisms
# involved in lipid-related processes and degradation of amyloid precursor proteins. We show strong
# genetic correlations with multiple health-related outcomes, and Mendelian randomisation results
# suggest a protective effect of cognitive ability on AD risk. These results are a step forward in
# identifying the genetic factors that contribute to AD risk and add novel insights into the
# neurobiology of AD.
# new table here, once published:
#
# Bellenguez C, Küçükali F, Jansen I, et al. New insights on the genetic etiology of Alzheimer’s
# and related dementia. medRxiv; 2020. DOI: 10.1101/2020.10.01.20200659.

#------------------------------------------------------------------------------------------------------------------------
library(EndophenotypeExplorer)
library(RPostgreSQL)
library(plyr)
library(ghdb)
#------------------------------------------------------------------------------------------------------------------------
ghdb <- GeneHancerDB()
tag.snp <- "rs4575098"
ld.snp  <- "rs11585858"
#------------------------------------------------------------------------------------------------------------------------
display.snps <- fuunction()
{
   igv <- start.igv("ADAMTS4", "hg38")

      #------------------------------------------------------------
      # first: a track with rsids, visible when clicked upon
      #------------------------------------------------------------

   tbl.hap <- read.table("haploreg-rs4575098-0.2.tsv", sep="\t", as.is=TRUE, header=TRUE, nrow=-1)
   tbl.track <- tbl.hap[, c("chrom", "hg38", "hg38", "rsid")]
   colnames(tbl.track)[c(2,3)] <- c("start", "end")
   tbl.track$start <- tbl.track$start - 1
   track <- DataFrameAnnotationTrack("hapName", tbl.track, color="brown")
   displayTrack(igv, track)
   shoulder <- 5000
   roi <- sprintf("%s:%d-%d", tbl.track$chrom[1], min(tbl.track$start)-shoulder, max(tbl.track$end)+shoulder)
   showGenomicRegion(igv, roi)

      #------------------------------------------------------------
      # now a track with rSquared magnitudes
      #------------------------------------------------------------

   tbl.track <- tbl.hap[, c("chrom", "hg38", "hg38", "rSquared")]
   colnames(tbl.track)[c(2,3)] <- c("start", "end")
   tbl.track$start <- tbl.track$start - 1
   track <- DataFrameQuantitativeTrack("hapScore", tbl.track, color="darkRed", autoscale=FALSE, min=0, max=1)
   displayTrack(igv, track)

} # display.snps
#------------------------------------------------------------------------------------------------------------------------
get.eqtls <- function()
{
    suppressWarnings(
        db.access.test <- try(system("/sbin/ping -c 1 khaleesi", intern=TRUE, ignore.stderr=TRUE))
        )
    if(length(db.access.test) == 0)
        stop("khaleesi database server unavailable")

    db <- dbConnect(PostgreSQL(), user="trena", password="trena", dbname="genereg2021", host="khaleesi")
    dbGetQuery(db, "select * from eqtls limit 3")
    getGenomicRegion(igv)

    roi <- getGenomicRegion(igv)
    printf("%d", with(roi, end-start))
    query <- with(roi, sprintf("select * from eqtls where chrom='%s' and hg38 > %d and hg38 < %d",
                               chrom, start, end))
    tbl.eqtl <- dbGetQuery(db, query)
    dim(tbl.eqtl)
    tbl.eqtl <- subset(tbl.eqtl, pvalue < 0.05)
    dim(tbl.eqtl)
    tbl.eqtl
    tbl.eqtl.hg38 <- unique(tbl.eqtl[, c("chrom", "hg38", "rsid", "genesymbol", "pvalue", "study", "tissue")])

    target.genes <- sort(unique(tbl.eqtl.hg38$genesymbol))
    deleters <- which(nchar(target.genes) == 0)
    deleters
    if(length(deleters) > 0){
        printf("deleting %d empty string gene names", length(deleters))
        target.genes <- target.genes[-deleters]
        }
    length(target.genes)

    pval.max <- 1e-5

    for(gene in target.genes){
       tbl.track <- subset(tbl.eqtl, genesymbol==gene & pvalue < pval.max)
       if(nrow(tbl.track) == 0) next;
       tbl.track$end <- tbl.track$hg38
       tbl.track$start <- tbl.track$hg38 -1
       tbl.track <- tbl.track[, c("chrom", "start", "end", "rsid")]
       track <- DataFrameAnnotationTrack(gene, tbl.track, color="random", trackHeight=25)
       displayTrack(igv, track)
       } # for target.gene

    genes.assoc <- c("PPOX","APOA2","ADAMTS4","B4GALT3","USP21","NDUFS2","TOMM40L")
    intersect(target.genes, genes.assoc)

    tbl.eqtls.oi <- subset(tbl.eqtl.hg38, pvalue <= 1e-6)  # NDUFS2 PCP4L1 TSTD1
    new.order <- order(tbl.eqtls.oi$rsid)
    tbl.eqtls.oi <- tbl.eqtls.oi[new.order,]
    dim(tbl.eqtls.oi)  # 32 7
    save(tbl.eqtls.oi, file="AD-eqtls-near-rs4575098-threshold-1e-6.RData")

    tbl.eqtls.oi <- subset(tbl.eqtl.hg38, pvalue <= 1e-5)  # F11R NDUFS2 PCP4L1 PPOX  TSTD1
    new.order <- order(tbl.eqtls.oi$rsid)
    tbl.eqtls.oi <- tbl.eqtls.oi[new.order,]
    dim(tbl.eqtls.oi)  # 90 7
    save(tbl.eqtls.oi, file="AD-eqtls-near-rs4575098-threshold-1e-5.RData")

    tbl.eqtls.oi <- subset(tbl.eqtl.hg38, pvalue <= 1e-4)  # F11R NDUFS2 PCP4L1 PPOX  TSTD1
    new.order <- order(tbl.eqtls.oi$rsid)
    tbl.eqtls.oi <- tbl.eqtls.oi[new.order,]
    dim(tbl.eqtls.oi)  # 117 7
    save(tbl.eqtls.oi, file="AD-eqtls-near-rs4575098-threshold-1e-4.RData")

      # next up on khaleesi: build tissue/study specific trena models for each set of 3 then 5 genes
      # break motifs for all rsids

} # get.eqtls
#------------------------------------------------------------------------------------------------------------------------
#
# tbl.williams <- get(load(system.file(package="EndophenotypeExplorer", "extdata", "gwas",
#                                      "williams-natureNeuroscience2020.RData")))
# dim(tbl.williams)
# rsids.williams <- unique(grep("^rs", unlist(strsplit(tbl.williams$rsid, ",")), value=TRUE))
# tag.snp %in% rsids.williams
# tbl.posthuma <- get(load(system.file(package="EndophenotypeExplorer", "extdata", "gwas",
#                                      "tbl.posthuma-38-geneAssociations-curated-3828x12.RData")))
# dim(tbl.posthuma)
# tag.snp %in% tbl.posthuma$rsid
#
# subset(tbl.williams, rsid==tag.snp)
# #    locusOrGene      rsid
# # 54     ADAMTS4 rs4575098
#
# igv <- start.igv("ADAMTS4", "hg19")
# zoomOut(igv)
#
# etx <- EndophenotypeExplorer$new("ADAMTS4", "hg19")
#
# tbl.loc <- etx$rsidToLoc(tag.snp)
#    #   chrom      hg19      hg38      rsid
#    #       1 161155392 161185602 rs4575098
#
# tbl.track <- with(tbl.loc, data.frame(chrom=chrom, start=hg19-1, end=hg19, stringsAsFactors=FALSE))
# track <- DataFrameAnnotationTrack(tag.snp, tbl.track, color="red", trackHeight=25)
# displayTrack(igv, track)
#
# #   chrom      hg19      hg38       rsid
# # 1     1 161156033 161186243 rs11585858
# tbl.track <- data.frame(chrom="1", start=161156033-1, end=161156033, stringsAsFactors=FALSE)
# track <- DataFrameAnnotationTrack(ld.snp, tbl.track, color="red", trackHeight=25)
# displayTrack(igv, track)
#
#
#
# tbl.all <- retrieveEnhancersFromDatabase(ghdb, target.gene, tissues="all")
# dim(tbl.all) # 71 16
# tbl.hg19 <- to.hg19(ghdb, tbl.all)
# dim(tbl.hg19)
# tbl.track <- tbl.hg19[, c("chrom", "start", "end", "combinedscore")]
# colnames(tbl.track)[4] <- "score"
# tbl.track$score <- asinh(tbl.track$score)
# track <- DataFrameQuantitativeTrack("ADAMST4-gh", tbl.track, color="blue", autoscale=TRUE)
# #displayTrack(igv, track)
#
#
# # hg38:  chr1, 161181277 161188841
# tbl.ghByRegion <- queryByRegion(ghdb, chrom="1", start=161181277, end=161188841)
# dim(tbl.ghByRegion)
# tbl.ghByRegion
#
# genes.other <- unique(tbl.ghByRegion$gene)
# for(gene in genes.other){
#    tbl.gh <- retrieveEnhancersFromDatabase(ghdb, gene, tissues="all")
#    tbl.hg19 <- to.hg19(ghdb, tbl.gh)
#    tbl.track <- tbl.hg19[, c("chrom", "start", "end", "combinedscore")]
#    colnames(tbl.track)[4] <- "score"
#    tbl.track$score <- asinh(tbl.track$score)
#    trackName <- sprintf("%s-gh", gene)
#    track <- DataFrameQuantitativeTrack(trackName, tbl.track, color="random", autoscale=FALSE,
#                                        min=0, max=10)
#    displayTrack(igv, track)
#    }
#
#
# #----------------------------------------------------------------------------------------------------
# explore.variants <- function()
# {
#    showGenomicRegion(igv, "chr1:161,155,310-161,156,114")  # tag & ld snp, with one inbetween
#      # maybe adjust manually, then:
#    roi.igv <- getGenomicRegion(igv)
#    roi.igv$chrom <- sub("chr", "", roi.igv$chrom)
#    #roi <- with(roi.igv, GRanges(seqnames=chrom, IRanges(start=start, end=end)))
#    #url <- "https://igv-data.systemsbiology.net/static/ampad/NIA-1898/chr1.vcf.gz"
#
#    mtx.geno <- with(roi.igv, etx$getGenoMatrix(chrom, start, end))
#    dim(mtx.geno)
#    rsid <- etx$locsToRSID(rownames(mtx.geno), "hg19")
#    rownames(mtx.geno) <- rsid
#    rownames(mtx.geno)
#
#    non.wt <- apply(mtx.geno, 1, function(row) length(which(row != "0/0")))
#    tbls <- list()
#    rsids.legit <- grep("^rs", names(non.wt), value=TRUE)
#    for(rsid in rsids.legit){
#       printf("--- rsid: %s", rsid)
#       tbls[[rsid]] <- etx$getAggregatedAlleleFrequencies(rsid)
#       }
#    tbl <- do.call(rbind.fill, tbls)
#    tbl.euro <- subset(tbl, population=="European")
#
#    tbl.euro
#        #          rsid ref population     A      G  total      A.freq    G.freq     min.freq     C    C.freq    T   T.freq
#       ##13   rs4575098   G   European 40918 129876 170794 23.95751607  76.04248  23.95751607    NA        NA   NA       NA
#       ##73  rs11585858   C   European 27919      0 121662 22.94800349   0.00000  22.94800349 93743  77.05200   NA       NA
#       ##49  rs60677420   C   European    NA     NA  24808          NA        NA   7.20735247 23020  92.79265 1788 7.207352
#
#       # 1  rs774377891  G   European    13  14273  14286  0.09099818  99.90900   0.09099818    NA        NA   NA       NA
#       #25 rs140964134   C   European     2      0  14266  0.01401935   0.00000   0.01401935 14264  99.98598   NA       NA
#       #37 rs181507083   G   European     0   9616   9616  0.00000000 100.00000 100.00000000    NA        NA    0 0.000000
#       #61 rs764488194   C   European    NA     NA   9690          NA        NA 100.00000000  9690 100.00000    0 0.000000
#
#     table(mtx.geno["rs4575098",])
#       #  0/0  0/1  1/1      40%
#       # 1121  682   91
#     table(mtx.geno["rs11585858",])
#       #  0/0  0/1  1/1      40%
#       # 1120  681   93
#     table(mtx.geno["rs60677420", ])
#       #  0/0  0/1  1/1
#       # 1600  280   14      15%
#
#     samples.mutant.rs4575098 <- colnames(mtx.geno)[which(mtx.geno["rs4575098",] != "0/0")]
#     samples.mutant.rs11585858 <- colnames(mtx.geno)[which(mtx.geno["rs11585858",] != "0/0")]
#     samples.mutant.rs60677420 <- colnames(mtx.geno)[which(mtx.geno["rs60677420",] != "0/0")]
#     length(samples.mutant.rs4575098)    # 773
#     length(samples.mutant.rs11585858)   # 774
#     length(samples.mutant.rs60677420)   # 294
#
#     length(intersect(samples.mutant.rs4575098, samples.mutant.rs11585858))   # 768
#     length(intersect(samples.mutant.rs60677420, samples.mutant.rs4575098))   # 55
#
#      # up next: are these all really european?  all path aging?  all with the variant
#      # shown above, at 23% euro frequency? map to patient, examine ethnicity
#      # more flexible rownames look up for mtx.geno.  they originally had the substitution
#      # is that preserved in the rsid, or covered over when there are 3 or 4 alleles?
#
#      # a small case to start, the 14 homozygous samples for rs60677420
#      soi <- "rs4575098"
#      soi <- "rs11585858"
#      soi <- "rs60677420"
#      vcf.ids <- names(which(mtx.geno[soi,] != "0/0"))
#      length(vcf.ids)
#      tbl.map <- etx$getIdMap()
#      stopifnot(all(vcf.ids %in% tbl.map$vcf))
#      tbls.clinical <- lapply(vcf.ids, function(id) etx$vcfSampleID.to.clinicalTable(id))
#      tbl.clinical <- do.call(rbind, tbls.clinical)
#      mean(tbl.clinical$cogdx, na.rm=TRUE) # [1] 2.627957
#      mean(tbl.clinical$braak, na.rm=TRUE) # [1] 3.779539
#      mean(tbl.clinical$cerad, na.rm=TRUE) # [1] 2.207705
#
#        #---------------------------------
#        # stats for all samples
#        #---------------------------------
#      tbls.clinical <- lapply(colnames(mtx.geno), function(id) etx$vcfSampleID.to.clinicalTable(id))
#      tbl.clinical <- do.call(rbind, tbls.clinical)
#      mean(tbl.clinical$cogdx, na.rm=TRUE) # [1] 2.653913
#      mean(tbl.clinical$braak, na.rm=TRUE) # [1] 3.65516
#      mean(tbl.clinical$cerad, na.rm=TRUE) # [1] 2.204013
#
#
#        #--------------------------------------------------
#        # get mtx.ad
#        #--------------------------------------------------
#      samples.ad <- subset(tbl.clinical, cogdx >= 4)$sampleID
#      length(samples.ad)  # 498
#      snps.oi <- c("rs4575098", "rs11585858", "rs60677420")
#      mtx.ad <- mtx.geno[snps.oi, samples.ad]
#      dim(mtx.ad)   # 3 498
#      table(mtx.ad[1,])
#      table(mtx.ad[2,])
#      table(mtx.ad[3,])
#
#        #----------------------------------------
#        # pvalue of 098 in
#        #----------------------------------------
#
#    studyPop.rs4575098 <- rep(0, ncol(mtx.ad))
#    studyPop.rs4575098[which(mtx.ad[1,] != "0/0")] <- 1
#    table(studyPop.rs4575098)
#
#    euroPop.all <- rep(0, 170794)
#    euroPop.all[sample(seq_len(length(euroPop.all)), 40918)] <- 1
#    table(euroPop.all)
#    t.test(studyPop.rs4575098, euroPop.all)$p.value  # [1] 4.178421e-12
#
# } # explore.variants
# #----------------------------------------------------------------------------------------------------
# # cribbed from https://github.com/KatrionaGoldmann/omicAnnotations/blob/main/R/gtex_eqtl.R
# # see also: https://smart-api.info/ui/8b3389ee427f89a358e157319a9db534#/expression/geneExpression
# # gene id mapping: https://cran.r-project.org/web/packages/grex/vignettes/grex.html
# # Why does GTEx use TPM units rather than RPKM?
# # We no longer provide expression quantifications in RPKM. TPM is a better unit for comparison of
# # RNA-seq samples. You can convert RPKM as follows (for each sample/column):
# # TPM = RPKM / sum(RPKM) * 1e6. For additional information,
# # please see: https://academic.oup.com/bioinformatics/article/26/4/493/243395
# gtex.demo <- function()
# {
#    library(httr)
#    library(grex); data(gtexv7)
#    url <-  paste0("https://gtexportal.org/rest/v1/reference/gene?geneId=", 10,
#                   "&gencodeVersion=v26&genomeBuild=GRCh38%2Fhg38&pageSize=250",
#                   "&format=json")
#    x <- GET(url)
#    content(x)
#
#    url.0 <- "https:///v1/expression/geneExpression?"
#    url.1 <- "datasetId=gtex_v8&gencodeId=ENSG00000065613.9"
#    url.2 <- "&tissueSiteDetailId=Brain_Cortex&format=json"
#
#    url <- sprintf("%s%s%s", url.0, url.1, url.2)
#    x <- GET(url)
#    content(x)
#    url <- https://gtexportal.org/rest/v1/expression/geneExpression?datasetId=gtex_v8&gencodeId=ENSG00000065613.9&tissueSiteDetailId=Brain_Cortex&format=json
#
#    tissues <- c('Adipose_Subcutaneous', 'Adipose_Visceral_Omentum',
#                 'Adrenal_Gland', 'Artery_Aorta', 'Artery_Coronary',
#                 'Artery_Tibial', 'Bladder', 'Brain_Amygdala',
#                 'Brain_Anterior_cingulate_cortex_BA24',
#                 'Brain_Caudate_basal_ganglia',
#                 'Brain_Cerebellar_Hemisphere', 'Brain_Cerebellum',
#                 'Brain_Cortex', 'Brain_Frontal_Cortex_BA9',
#                 'Brain_Hippocampus', 'Brain_Hypothalamus',
#                 'Brain_Nucleus_accumbens_basal_ganglia',
#                 'Brain_Putamen_basal_ganglia',
#                 'Brain_Spinal_cord_cervical_c-1',
#                 'Brain_Substantia_nigra', 'Breast_Mammary_Tissue',
#                 'Cells_EBV-transformed_lymphocytes',
#                 'Cells_Cultured_fibroblasts',
#                 'Cells_Transformed_fibroblasts', 'Cervix_Ectocervix',
#                 'Cervix_Endocervix', 'Colon_Sigmoid',
#                 'Colon_Transverse',
#                 'Esophagus_Gastroesophageal_Junction',
#                 'Esophagus_Mucosa', 'Esophagus_Muscularis',
#                 'Fallopian_Tube', 'Heart_Atrial_Appendage',
#                 'Heart_Left_Ventricle', 'Kidney_Cortex',
#                 'Kidney_Medulla', 'Liver', 'Lung',
#                 'Minor_Salivary_Gland', 'Muscle_Skeletal',
#                 'Nerve_Tibial', 'Ovary', 'Pancreas', 'Pituitary',
#                 'Prostate', 'Skin_Not_Sun_Exposed_Suprapubic',
#                 'Skin_Sun_Exposed_Lower_leg',
#                 'Small_Intestine_Terminal_Ileum', 'Spleen', 'Stomach',
#                 'Testis', 'Thyroid', 'Uterus', 'Vagina',
#                 'Whole_Blood')
#
#    gencode.gr <- import("gencode.v26.GRCh38.genes.gtf")
#    tbl.gencode <- as.data.frame(gencode.gr)
#
#    brain.tissues <- grep("Brain", tissues, value=TRUE)
#    brain.tissue.string <- paste(brain.tissues, collapse=",")
#    gata2 <- "ENSG00000179348.11"
#    fcer1g <- "ENSG00000158869.10"
#
#    geneSymbol <- "ADAMTS4"
#    geneSymbol <- "FCER1G"
#    gencode.id <- subset(tbl.gencode, type=="gene" & transcript_name==geneSymbol)$gene_id
#
#    url <- "https://gtexportal.org/rest/v1/expression/geneExpression?datasetId=gtex_v7&gencodeId=ENSG00000135100.13&tissueSiteDetailId=Pancreas&format=json"
#    GET(url)
#
#    url.0 <- "https://gtexportal.org/rest/v1/expression/geneExpression"
#    url.0 <- "https://gtexportal.org/rest/v1/expression/medianGeneExpression"
#    url.1 <- "datasetId=gtex_v7"
#    url.1 <- "datasetId=gtex_v8"
#    url.2 <- sprintf("gencodeId=%s", gata2)
#    url.2 <- sprintf("gencodeId=%s", fcer1g)
#    url.2 <- sprintf("gencodeId=%s", gencode.id)
#    #url.2 <- sprintf("gencodeId=%s", "ENSG00000135100")
#    #url.3 <- "tissueSiteDetailId=Pancreas,Brain_Cerebellum"
#    url.3 <- sprintf("tissueSiteDetailId=%s", brain.tissue.string)
#    url.4 <- "format=json"
#    url <- sprintf("%s?%s&%s&%s&%s", url.0, url.1, url.2, url.3, url.4)
#    x <- content(GET(url))
#    tbls <- lapply(x[[1]], as.data.frame)
#    tbl <- do.call(rbind, tbls)
#    tbl
#
# } # gtex.demo
# #----------------------------------------------------------------------------------------------------
#
#
PriceLab/TrenaProjectAD documentation built on July 11, 2022, 3:42 a.m.