studies/sec61g-egfr/egfr.R

# get gtex tissue names:
#    tbl.summary <- gwex$getEQTLSummary()
#    checkTrue(nrow(tbl.summary) > 450)
#    brain.tissue.study.ids <- grep("GTEx_V8.Brain_[CH]", tbl.summary$unique_id, v=TRUE)
#----------------------------------------------------------------------------------------------------
library(gwasExplorer)
library(EndophenotypeExplorer)
library(RUnit)
library(GenomicRanges)
#----------------------------------------------------------------------------------------------------
eqtl.list <- get(load("eqtl.tbls.RData"))
names(eqtl.list)
brain.studies <- sprintf("GTEx_V8.Brain_%s", names(eqtl.list))

genomic.shoulder <- 1000
targetGene <- "EGFR"
tag.snp <- "rs76928645"
tag.snp.hg19 <- 54941328
tag.snp.hg38 <- 54873635
tag.snp.chrom <- "chr7"

if(!exists("tbl.linkage"))
  tbl.linkage <- get(load("tbl.linkage.hg19.hg38.RData"))

if(!exists("tbl.fimo"))
    tbl.fimo <- get(load("tbl.fimo.EGFR.RData"))

#----------------------------------------------------------------------------------------------------
specify.region <- function()
{
   require(rtracklayer)
    # the haploreg area, also has high eqtls in gtex cortex:  chr7:54,828,467-54,949,551
    start <- 54828467
    end   <- 54949551
    # a 4300 bp region, should find two TEAD1 broken motifs and 1 EMX2
    #  start <- 55019572
    #  end <- 55023954

   gr.hg38 <- GRanges(seqnames="chr7", IRanges(start=start, end=end))

    if(!file.exists("hg38ToHg19.over.chain")){
       system("curl -O http://hgdownload.soe.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz")
       system("gunzip hg38ToHg19.over.chain.gz")
       }
    chain <- import.chain("hg38ToHg19.over.chain")
    x <- liftOver(gr.hg38, chain)
    gr.hg19 <- unlist(x)
    tbl.roi <- data.frame(chrom="chr7",
                          start.hg19=as.data.frame(gr.hg19)$start[1],
                          end.hg19=as.data.frame(gr.hg19)$end[1],
                          start.hg38=start,
                          end.hg38=end,
                          width=1+end-start,
                          stringsAsFactors=FALSE)
    tbl.roi

} # specify.region
#----------------------------------------------------------------------------------------------------
break.motifs <- function(rsids, motif.names)
{
    message(sprintf("--- break.motifs: %d rsids, %d motif.names", length(rsids), length(motif.names)))

    require(motifbreakR)
    require(BiocParallel)
    require(SNPlocs.Hsapiens.dbSNP155.GRCh38)
    require(BSgenome.Hsapiens.UCSC.hg38)

    motifs.selected <- MotifDb[motif.names]
    print(system.time({
       snps.gr.list <- lapply(rsids,
                           function(rsid) snps.from.rsid(rsid = rsid,
                                                         dbSNP=SNPlocs.Hsapiens.dbSNP155.GRCh38,
                                                         search.genome=BSgenome.Hsapiens.UCSC.hg38))
       snps.gr <- unlist(as(snps.gr.list, "GRangesList"))
       }))

    bpparam <- SerialParam()
    print(system.time(results <- motifbreakR(snpList = snps.gr,
                           filterp = TRUE,
                           pwmList = motifs.selected,
                           show.neutral=FALSE,
                           method = c("ic", "log", "notrans")[1],
                           bkg = c(A=0.25, C=0.25, G=0.25, T=0.25),
                           BPPARAM = bpparam,
                           verbose=TRUE)))
    tbl.out <- as.data.frame(results, row.names=NULL)
    if(nrow(tbl.out) == 0)
        return(data.frame())
    tbl.out <- subset(tbl.out, effect=="strong")
    tbl.out$pctDelta <- with(tbl.out, pctRef - pctAlt)
    tbl.out

} # break.motifs
#----------------------------------------------------------------------------------------------------
test_break.motifs <- function()
{
    motif.names <- c("Hsapiens-HOCOMOCOv11-core-A-RXRA_HUMAN.H11MO.0.A",
                     "Hsapiens-jaspar2018-HNF4G-MA0484.1")

    rsids <- c("rs6979446")
    rsids <- c("rs6979446", "rs73410456")
    tbl.breaks <- break.motifs(rsids, motif.names)
    checkEquals(dim(tbl.breaks), c(2, 26))
    checkEqualsNumeric(tbl.breaks$pctDelta, c(0.1510949, 0.1809402), tol=1e-4)

    big.test <- FALSE
    if(big.test){
        rsids.20 <- c("rs149352678", "rs74504435", "rs77126132", "rs151057105", "rs75061358",
                      "rs76928645", "rs80013346", "rs11765408", "rs142461330", "rs6979446",
                      "rs6951828", "rs115204793", "rs115849102", "rs142950337", "rs58626582",
                      "rs59010780", "rs60596612", "rs61022057", "rs6975304", "rs77148056")
        motif.names.20 <- c("Hsapiens-HOCOMOCOv11-core-A-TYY1_HUMAN.H11MO.0.A",
                            "Hsapiens-jaspar2018-YY1-MA0095.2",
                            "Hsapiens-HOCOMOCOv11-core-A-ZFP42_HUMAN.H11MO.0.A",
                            "Hsapiens-jaspar2018-GSX2-MA0893.1",
                            "Hsapiens-HOCOMOCOv11-core-A-HXB13_HUMAN.H11MO.0.A",
                            "Hsapiens-jaspar2018-MNX1-MA0707.1",
                            "Hsapiens-jaspar2018-HOXA10-MA0899.1",
                            "Hsapiens-HOCOMOCOv11-core-A-MEF2D_HUMAN.H11MO.0.A",
                            "Hsapiens-jaspar2018-CDX1-MA0878.1",
                            "Hsapiens-jaspar2018-HOXA13-MA0650.1",
                            "Hsapiens-jaspar2018-HOXB13-MA0901.1",
                            "Hsapiens-jaspar2018-HOXD13-MA0909.1",
                            "Hsapiens-jaspar2018-MEF2B-MA0660.1",
                            "Hsapiens-HOCOMOCOv11-core-A-TBP_HUMAN.H11MO.0.A",
                            "Hsapiens-jaspar2018-ZNF263-MA0528.1",
                            "Hsapiens-HOCOMOCOv11-core-A-RXRA_HUMAN.H11MO.0.A",
                            "Hsapiens-HOCOMOCOv11-core-A-KLF15_HUMAN.H11MO.0.A",
                            "Hsapiens-jaspar2018-TCF7L1-MA1421.1",
                            "Hsapiens-jaspar2018-RXRG-MA0856.1",
                            "Hsapiens-jaspar2018-HNF4G-MA0484.1")

        t0 <- system.time(
            tbl.breaks.2020 <- break.motifs(rsids.20, motif.names.20)
            )
            #    user  system elapsed
            #  60.409  19.815  80.362
        checkTrue(nrow(tbl.breaks) > 100)
        } # big.test

} # test_break.motifs
#----------------------------------------------------------------------------------------------------
# ampad eqtls apparently have no beta, just pval.
# every trena tf is scored as the sum of the break score, for every variant judged to break
# its motif, * -log10 of the eqtls pvalue
score.breakage <- function(tbl.trena, tbl.ampad.eqtls, tbl.breaks)
{
   tfs.oi <- tbl.trena$gene

   if(!"sig.no.beta" %in% colnames(tbl.ampad.eqtls)){
       sig.no.beta <- with(tbl.ampad.eqtls, -log10(pvalue)) # * abs(beta))
       tbl.ampad.eqtls$sig.no.beta <- sig.no.beta
       }

   breakage.scores <- list()

   for(tf in tfs.oi){
      #tbl.tf.breaks.sub <- subset(tbl.breaks, geneSymbol == tf & SNP_id %in% tbl.eqtl$rsid)
      tbl.tf.breaks.sub <- subset(tbl.breaks, geneSymbol == tf & SNP_id %in% tbl.ampad.eqtls$rsid)
      dups <- which(duplicated(tbl.tf.breaks.sub[, c("SNP_id", "geneSymbol")]))
      if(length(dups) > 0)
          tbl.tf.breaks.sub <- tbl.tf.breaks.sub[-dups,]
      tf.breakage.score <- 0
      for(breaking.snp in tbl.tf.breaks.sub$SNP_id){
          pctDelta <- abs(subset(tbl.tf.breaks.sub, SNP_id==breaking.snp)$pctDelta)
              # multiply this by the tf's rfNorm and the eQTLs sig.no.beta
          sig.no.beta <- max(subset(tbl.ampad.eqtls, rsid==breaking.snp)$sig.no.beta)
          tf.rfNorm <- subset(tbl.trena, gene==tf)$rfNorm
          new.increment <- (pctDelta + sig.no.beta) * tf.rfNorm
             # printf("%20s: %f", breaking.snp, new.increment)
          tf.breakage.score <- tf.breakage.score + new.increment
          } # for breaking.snp
      # printf("--- %s: %5.2f", tf, tf.breakage.score)
      breakage.scores[[tf]] <- tf.breakage.score
      } # for tf

    tbl.trena$breakage.score <- round(as.numeric(breakage.scores), digits=2)
    tbl.trena

} # score.breakage
#----------------------------------------------------------------------------------------------------
main <- function()
{
    for(brain.tissue in brain.studies){
        output.filename <- sprintf ('gwex.%s.%s.%s.RData', targetGene, brain.tissue,
                                    format (Sys.time(), "%a.%b.%d.%Y-%H:%M:%S"))
        printf("--- starting on %s", output.filename )
        gwex <- gwasExplorer$new(targetGene=targetGene, locusName="SEC61G", tagSnp=tag.snp,
                                 shoulder=genomic.shoulder, tissueName=brain.tissue,
                                 tbl.prespecifiedRegion=specify.region())

           # previously obtained eqtls
        short.tissue.name <- sub("GTEx_V8.Brain_", "", brain.tissue)
        stopifnot(short.tissue.name %in% names(eqtl.list))
        tbl.eqtl <- eqtl.list[[short.tissue.name]]
        deleters <- which(is.na(tbl.eqtl$hg38))
        if(length(deleters) > 0)
            tbl.eqtl <- tbl.eqtl[-deleters,]

        eqtl.pval.max <- 1e-3
        tbl.eqtl <- subset(tbl.eqtl, pvalue <= eqtl.pval.max & gene==targetGene)
        stopifnot(nrow(tbl.eqtl) > 0)
        eqtls <- gwex$getEqtlsForGene(eqtl.catalog.studyIDs=brain.tissue,
                                  pval.threshold=eqtl.pval.max,
                                  tbl.eqtls.gtex=tbl.eqtl)

        unique.variants <- unique(c(eqtls$ampad$rsid, eqtls$gtex$rsid))
        length(unique.variants)   # 65

           #---------------------------------------------------------
           # now create the tms table. 2 eqtl columns should appear
           #---------------------------------------------------------

        data.dir <- "~/github/TrenaProjectAD/inst/extdata/genomicRegions"
        filename <- "boca-hg38-consensus-ATAC.RData"
        tbl.boca <- get(load(file.path(data.dir, filename)))

        tbl.tms <- gwex$trenaMultiScore(brain.tissue, tbl.fimo=tbl.fimo, tbl.oc=tbl.boca)
        dim(tbl.tms)
        print(colnames(tbl.tms))

        gene.expression.absCor.minimum <- 0.5

           #---------------------------------------------------------
           # build wild-type model
           #---------------------------------------------------------
        build.wt.model <- function(top.tfs, mtx.rna, tbl.tms.wt){
            top.tfs <- sort(unique(tbl.tms.wt$tf))
            length(top.tfs)  # 44
            tbl.trena <- gwex$runTrena(top.tfs, mtx.rna, tbl.tms.wt)
            tbl.trena$rfNorm <- round(tbl.trena$rfNorm, digits=3)
            tbl.trena
            } # build.wt.model


             #--------------------------------------------------------------
             # identify the ampad variants in these regions, and the motifs
             #--------------------------------------------------------------

        tbl.tms.wt <- subset(tbl.tms,
                             (gh > 1 | gtex.eqtl | ampad.eqtl | chip) &
                             abs(cor.all) > 0.5 &
                             fimo_pvalue < 0.00005)
        dim(tbl.tms.wt)

        gr.tfbs <- GRanges(tbl.tms.wt[, c("chrom", "start", "end")])
        tbl.ampad.eqtls <- eqtls$ampad[, c("chrom", "hg38", "hg38", "rsid", "pvalue", "genesymbol")]
        colnames(tbl.ampad.eqtls) <- c("chrom", "start", "end", "rsid", "pvalue", "genesymbol")
        gr.eqtls <- GRanges(tbl.ampad.eqtls)
        tbl.ov <- as.data.frame(findOverlaps(gr.eqtls, gr.tfbs))
        nrow(tbl.ov)
        if(nrow(tbl.ov) > 0){
           tbl.ampad.eqtls.tfbs <- tbl.ampad.eqtls[unique(sort(tbl.ov$queryHits)),]
           rsids.in.tfbs <- unique(tbl.ampad.eqtls[tbl.ov$queryHits, "rsid"])
           motif.names <- unique(tbl.tms.wt[tbl.ov[,2], "motif_id"])
           tbl.breaks <- break.motifs(rsids.in.tfbs, motif.names)
           if(nrow(tbl.breaks) == 0){
               new.filename <- sub("gwex", "gwex-FAILED", output.filename)
               tbl.trena <- data.frame()
               save(short.tissue.name, brain.tissue, eqtls, tbl.breaks, tbl.tms, tbl.tms.wt,
                    tbl.trena, file=new.filename)
               next;
               }
           new.order <- order(abs(tbl.breaks$pctDelta), decreasing=TRUE)
           tbl.breaks <- tbl.breaks[new.order,]
           rownames(tbl.breaks) <- NULL
           colnames(tbl.breaks)[1] <- "chrom"
           tbl.breaks$chrom <- as.character(tbl.breaks$chrom)
           broi <- c("chrom", "start", "end", "SNP_id", "geneSymbol", "providerId", "seqMatch", "pctRef", "pctAlt", "pctDelta")
           tbl.breaks.easy <- tbl.breaks[, broi]
           tbl.breaks.easy$start <- tbl.breaks.easy$start - 1
           dups <- which(duplicated(tbl.breaks.easy[, c("chrom", "start", "end", "SNP_id", "geneSymbol", "providerId")]))
           if(length(dups) > 0)
               tbl.breaks.easy <- tbl.breaks.easy[-dups,]
           }

        top.ranked <- table(tbl.tms.wt$tf)
        print(top.ranked)
        top.tfs <- names(top.ranked)
        length(top.tfs)  # 6

        mtx.rna <- gwex$getExpressionMatrix(brain.tissue)
        tbl.trena <- build.wt.model(top.tfs, mtx.rna, tbl.tms.wt)
        dim(tbl.trena)
        eqtl.pval.max <- 1e-4
        tbl.scored <- score.breakage(tbl.trena, tbl.ampad.eqtls, tbl.breaks)
        printf("------- results for %s", brain.tissue)
        printf("------- tbl.scored, sum top 10: %f", sum(head(tbl.scored$breakage.score, n=10)))
        printf("--- saving to %s", output.filename)
        save(short.tissue.name, brain.tissue, eqtls, tbl.tms, tbl.trena, tbl.breaks, tbl.scored, file=output.filename)
        } # for brain.tissue

} # main
#----------------------------------------------------------------------------------------------------
viz <- function()
{
   if(!exists("igv")){
      igv <- start.igv(targetGene, "hg38")
      shoulder <- 5e5
      showGenomicRegion(igv, sprintf("%s:%d-%d", tag.snp.chrom, tag.snp.hg19-shoulder, tag.snp.hg19+shoulder))
      tbl.track <- data.frame(chrom=tag.snp.chrom, start=tag.snp.hg38-1, end=tag.snp.hg38, stringsAsFactors=FALSE)
      track <- DataFrameAnnotationTrack(tag.snp, tbl.track, trackHeight=25, color="darkred")
      displayTrack(igv, track)
      threshold <- 0.5
      tbl.track <- subset(tbl.linkage, r2 >= threshold)[, c("chrom", "hg38", "hg38", "r2", "rsid")]
      colnames(tbl.track) <- c("chrom", "start", "end", "score", "rsid")
      tbl.track$chrom <- tag.snp.chrom
      tbl.track$start <- tbl.track$start - 1
      track.title <- sprintf("haploreg eur, r^2 > %3.2f", threshold)
      track <- DataFrameQuantitativeTrack(track.title, tbl.track, autoscale=FALSE, min=0, max=1, color="black")
      displayTrack(igv, track)
      }

   etx <- EndophenotypeExplorer$new(targetGene, "hg19", vcf.project="AMPAD", initialize.snpLocs=TRUE)
   tbl.eqtls <- etx$get.ampad.EQTLsForGene()
   dim(tbl.eqtls)
   head(tbl.eqtls, n=20)
   tail(tbl.eqtls, n=20)
   tbl.track <- subset(tbl.eqtls, pvalue < 1e-2 & study != "GTEx")[, c("chrom", "hg38", "hg38", "rsid", "pvalue")]
   dim(tbl.track)
   colnames(tbl.track)[2:3] <- c("start", "end")
   tbl.track$start <- tbl.track$start - 1
   colnames(tbl.track) <- NULL
   track <- GWASTrack("ampad", tbl.track, trackHeight=100) # chrom.col=0, pos.col=0, pval.col=0)
   displayTrack(igv, track)

   removeTracksByName(igv,
                      getTrackNames(igv)[5:9]
                      )


      # now broken motifs of tfs in model
   tfs.oi <- tbl.trena$gene[1:20]
   for(TF in tfs.oi){
      tbl.breaks.sub <- subset(tbl.breaks.easy, geneSymbol==TF)[, c("chrom", "start", "end", "pctDelta")]
      tbl.breaks.sub$pctDelta <- -1.0 * tbl.breaks.sub$pctDelta
      if(nrow(tbl.breaks.sub) > 0){
         track <- DataFrameQuantitativeTrack(TF, tbl.breaks.sub, color="random", autoscale=FALSE, min=-0.2, max=0.2)
         displayTrack(igv, track)
         }
      }

   dim(tbl.tms.sub)
   tfs <- sort(unique(tbl.tms.sub$tf))
   length(tfs)


   for(TF in tfs){
      tbl.track <- subset(tbl.tms.sub, tf==TF)[, c("chrom", "start", "end", "cor.all")]
      track <- DataFrameQuantitativeTrack(TF, tbl.track, color="random", autoscale=FALSE, min=-1, max=1)
      displayTrack(igv, track)
      } # for TF

} # viz
#----------------------------------------------------------------------------------------------------
# motifBreak.score <- with(tbl.tf, abs(pctDelta) * 100)
#       eqtl.score <- with(tbl.tf, -log10(gtex.eqtl.pval)* abs(gtex.eqtl.beta) * 100)
#     trena.score <- with(tbl.tf, (abs(betaLasso) * 100) + (rfNorm * 10))
#      tfbs.score <- 1/tfbs.count
# score <- trena.score * eqtl.score * motifBreak.score * tfbs.score
review <- function()
{
  files <- list.files(".", pattern="gwex.EGFR.*RData")
  files
  targetGene <- "EGFR"

  if(exists("tbl.scored")) rm(tbl.scored)

  for(file in files){
      print(load(file))
      if(exists("tbl.scored")){
         printf("%s: %f", file, sum(tbl.scored$breakage.score[1:10]))
         rm(tbl.scored)
         } # if tbl.scored
      } # for file

   file <- files[5]
   file
   print(load(file))
   breaks.coi <- c("chrom", "start", "end", "SNP_id", "pctRef", "pctAlt", "pctDelta", "geneSymbol")
   nrow(tbl.breaks)
   tbls <- list()
   tfs <- head(tbl.scored$gene, n=10)
   for(TF in tfs){
          # just the tfbs for current TF
      tbl.tms.TF <- subset(tbl.tms, tf==TF & ampad.eqtl)
      if(nrow(tbl.breaks) == 0) next;
          # just the breaks for this TF and a meaninful change in binding
          # get the whole table width in case we want to look at the values
      tbl.breaks.TF <- subset(tbl.breaks, geneSymbol==TF & abs(pctDelta) > 0.05) [, breaks.coi]
          # just the breaks which fall within previously selected tfbs
      if(nrow(tbl.breaks.TF) == 0)
          next;

      gr.tms <- GRanges(tbl.tms.TF)
      gr.breaks <- GRanges(tbl.breaks.TF)
      tbl.ov <- as.data.frame(findOverlaps(gr.breaks, gr.tms))
      if(nrow(tbl.ov) == 0)
          next;
      breaking.snps <- unique(tbl.breaks.TF$SNP_id[tbl.ov[, "queryHits"]])
          # how many TFBS for this TF?
      tfbs.count <- subset(tbl.trena, gene==TF)$tfbs
      if(nrow(tbl.tms.TF) == 0) next;
      if(nrow(tbl.breaks.TF) == 0) next;
      motifBreak.score <- with(tbl.breaks.TF, sum(abs(pctDelta)) * 100)
         # the eqtl score is the average -log10(pval) across all breaking snps
         # might want to sum these instead
      tbl.eqtl.TF <- subset(eqtls$ampad, rsid %in% breaking.snps & study=="ampad-rosmap")
      eqtl.score <-  sum(-log10(tbl.eqtl.TF$pvalue))
      trena.score <- with(subset(tbl.trena, gene==TF), (abs(betaLasso) * 100) + (rfNorm * 10))
      tfbs.count <- subset(tbl.trena, gene==TF)$tfbs
      score.mult <- max(trena.score * eqtl.score * motifBreak.score) # * tfbs.score)
      score.sum <- max(trena.score + eqtl.score + motifBreak.score) # * tfbs.score)
      #printf("%10s: %6.1f %6.1f %6.1f %d  %d", TF,
      #       trena.score, eqtl.score, motifBreak.score,
      #       as.integer(score.sum), as.integer(score.mult))
      #browser()
      tbl <- data.frame(targetGene=targetGene,
                        tf=TF,
                        tissue=short.tissue.name,
                        trena=trena.score,
                        tfbs.count=tfbs.count,
                        eqtl=eqtl.score,
                        motifBreak=motifBreak.score,
                        total=as.integer(score.mult),
                        rsids=paste(breaking.snps, collapse=" "),
                        stringsAsFactors=FALSE)
       tbls[[TF]] <- unique(tbl)
      } # for TF

    tbl.all <- do.call(rbind, tbls)
    new.order <- order(tbl.all$total, decreasing=TRUE)
    tbl.all <- tbl.all[new.order,]
    rownames(tbl.all) <- NULL

    tbl.all

     # now inspect
   TF <- "POU3F2"
   tbl.tms.TF <- subset(tbl.tms, tf==TF & ampad.eqtl)
   dim(tbl.tms.TF)
   track <- DataFrameAnnotationTrack(TF, tbl.tms.TF, color="random", trackHeight=25)
   displayTrack(igv, track)

   tbl.breaks.TF <- subset(tbl.breaks, geneSymbol==TF & abs(pctDelta) > 0.05) [, breaks.coi]
   track <- DataFrameQuantitativeTrack(sprintf("%s.rsids", TF), tbl.breaks.TF[, c(1,2,3,7)],
                                       color="brown", autoscale=FALSE, min=-0.2, max=0.2)
   displayTrack(igv, track)

} # review
#----------------------------------------------------------------------------------------------------
paul-shannon/gwasExplorer documentation built on July 16, 2022, 4:33 p.m.