docker/components/ndufs2-demo.R

library(trena)
library(ghdb)

if(!exists("igv"))
    igv <- start.igv("NDUFS2", "hg38")

ghdb <- GeneHancerDB()
targetGene <- "NDUFS2"
tbl.gh <- retrieveEnhancersFromDatabase(ghdb, targetGene, tissues="all")
tbl.gh$score <- asinh(tbl.gh$combinedscore)
track <- DataFrameQuantitativeTrack("GH", tbl.gh[, c("chrom", "start", "end", "score")],
                                    autoscale=TRUE, color="black")
displayTrack(igv, track)

data.dir <- "~/github/tms-makeAndBreak/studies/rs4575098-ndufs2-region/shared"
                # haploreg-rs4575098-0.2.tsv
                # 1883.rosmap.snps.from.fimo.region.for.motifbreakR.RData
                # tbl.fimo.NDUFS2.RData
                # gtex.brain.eqtls.chr1-160684126-161688825.RData

data.dir.2 <- "~/github/tms-makeAndBreak/studies/rs4575098-ndufs2-region/ndufs2"

   #-----------------------------------------------------------------------------------
   # create the FeatureTable, initialize with fimo results trimmed to GeneHancer range
   #----------------------------------------------------------------------------------

ft <- FeatureTable$new(target.gene=targetGene, reference.genome="hg38")
tbl.fimo <- get(load(file.path(data.dir, "tbl.fimo.NDUFS2.RData")))
dim(tbl.fimo)   # 2117704       9

                # trim to quasi-informative limits provided by GeneHancer
tbl.fimo <- subset(tbl.fimo, start > (min(tbl.gh$start) - 1000) &
                             end   < (max(tbl.gh$end)   + 1000))
dim(tbl.fimo)   #  828415      9
tbl.track <- tbl.fimo[, c("chrom", "start", "end", "p.value")]
tbl.track$p.value <- -log10(tbl.track$p.value)
track <- DataFrameQuantitativeTrack("fimo", tbl.track, autoscale=TRUE, color="darkred")
displayTrack(igv, track)
ft$setFundamentalRegions(tbl.fimo)

   #-------------------------------------------------------------------
   # add GTEx brain eQTLs for NDUFS2, cortex only
   #-------------------------------------------------------------------

tbl.eqtl.gtex <- get(load(file.path(data.dir, "gtex.brain.eqtls.chr1-160684126-161688825.RData")))
dim(tbl.eqtl.gtex)   # 1690734     8
tbl.eqtl.gtex <- subset(tbl.eqtl.gtex, gene==targetGene)
dim(tbl.eqtl.gtex)   #   29553     8
table(tbl.eqtl.gtex$id)
tbl.eqtl.gtex <- subset(tbl.eqtl.gtex, id=="GTEx_V8.Brain_Cortex" & pvalue < 0.10)
dim(tbl.eqtl.gtex)   #     391     9
tbl.eqtl.gtex$score <- -log10(tbl.eqtl.gtex$pvalue)
tbl.eqtl.gtex$start <- tbl.eqtl.gtex$hg38 - 1
tbl.eqtl.gtex$end <- tbl.eqtl.gtex$hg38

tbl.eqtl.gtex.ready <- tbl.eqtl.gtex[, c("chrom", "start", "end", "pvalue", "score", "rsid")]
track <- DataFrameQuantitativeTrack("gtex cortex",
                                    tbl.eqtl.gtex.ready[, c("chrom", "start", "end", "score")],
                                    autoscale=FALSE, min=0, max=18, color="magenta")
displayTrack(igv, track)

feature.guide <- list(gtex.eqtl.rsid="rsid", gtex.eqtl.pvalue="pvalue")
default.values <- list(gtex.eqtl.rsid="", gtex.eqtl.pvalue=1)
ft$addRegionFeature(tbl.eqtl.gtex.ready, feature.genome="hg38", feature.guide, default.values)


   #-------------------------------------------------------------------
   # add rosmap brain eQTLs for NDUFS2
   #-------------------------------------------------------------------

tbl.eqtl.rosmap <- get(load(file.path(data.dir.2, "rosmap.eqtls.RData")))
dim(tbl.eqtl.rosmap)   # 1894 14
tbl.eqtl.rosmap <- subset(tbl.eqtl.rosmap, pvalue < 0.10)
dim(tbl.eqtl.rosmap)   #  346 14
tbl.eqtl.rosmap.ready <- tbl.eqtl.rosmap[, c("chrom", "hg38", "hg38", "pvalue", "rsid")]
tbl.eqtl.rosmap.ready$score <- -log10(tbl.eqtl.rosmap$pvalue)
tbl.eqtl.rosmap.ready$start <- tbl.eqtl.rosmap$hg38 - 1
tbl.eqtl.rosmap.ready$end <- tbl.eqtl.rosmap$hg38

track <- DataFrameQuantitativeTrack("rosmap eqtl",
                                    tbl.eqtl.rosmap.ready[, c("chrom", "start", "end", "score")],
                                    autoscale=FALSE, min=0, max=18, color="random")
displayTrack(igv, track)

feature.guide <- list(rosmap.eqtl.rsid="rsid", rosmap.eqtl.pvalue="pvalue")
default.values <- list(rosmap.eqtl.rsid="", rosmap.eqtl.pvalue=1)
ft$addRegionFeature(tbl.eqtl.rosmap.ready, feature.genome="hg38", feature.guide, default.values)

   #-------------------------------------------------------------------
   # add mayo atac
   #-------------------------------------------------------------------

data.dir.3 <- "~/github/TrenaProjectAD/inst/extdata/genomicRegions"
filename <- "mayoAllPeaks.merged.96064x4.RData"
tbl.mayoAtac <- get(load(file.path(data.dir.3, filename)))
dim(tbl.mayoAtac)
tbl.mayoAtac <- subset(tbl.mayoAtac, chrom==tbl.eqtl.rosmap$chrom[1] &
                                     start >= min(tbl.eqtl.rosmap$hg38) - 1000 &
                                     end   <= max(tbl.eqtl.rosmap$hg38) + 1000)
dim(tbl.mayoAtac)  # 36 5
track <- DataFrameAnnotationTrack("mayo atac", tbl.mayoAtac, color="random")
displayTrack(igv, track)

tbl.mayoAtac$status <- TRUE   # we don't have scores, alas
feature.guide <- list(mayo.atac="status")
default.values <- list(mayo.atac=FALSE)
ft$addRegionFeature(tbl.mayoAtac, feature.genome="hg38", feature.guide, default.values)
# tbl.ft <- ft$getTable()

   #-------------------------------------------------------------------
   # add the small set of eQTL european haplotype scores from haploreg
   #-------------------------------------------------------------------

f <- system.file(package="trena", "extdata", "featureTable", "tbl.haploreg.european.rs4575098.RData")
tbl.haplo <- read.table(file.path(data.dir, "haploreg-rs4575098-0.2.tsv"), sep="\t", header=TRUE)
dim(tbl.haplo)
tbl.haplo$start <- tbl.haplo$hg38 - 1
tbl.haplo$end   <- tbl.haplo$hg38
track <- DataFrameQuantitativeTrack("haploreg", tbl.haplo[, c("chrom", "start", "end", "rSquared")],
                                    autoscale=FALSE, min=0, max=1)
displayTrack(igv, track)

feature.guide <-  list(haploreg.rsid="rsid", haploreg.rSquared="rSquared")
default.values <- list(haploreg.rsid="",     haploreg.rSquared=0)
ft$addRegionFeature(tbl.haplo, feature.genome="hg38", feature.guide, default.values)

   #-------------------------------------------------------------------------
   # add NDUFS2/TF correlated expression scores from GTEx Brain_Cortex.RData
   #-------------------------------------------------------------------------

data.dir.4 <- "~/github/TrenaProjectAD/inst/extdata/expression/gtex.v8"
mtx <- get(load(file.path(data.dir.4, "Brain_Cortex.RData")))
dim(mtx)   # 24821 206


genes.to.consider <- c(targetGene, unique(intersect(tbl.fimo$tf, rownames(mtx))))
length(genes.to.consider)
gene.correlations <- lapply(genes.to.consider, function(gene) cor(mtx[targetGene,], mtx[gene,], method="spearman", use="complete"))
tbl.cor <- data.frame(gene=genes.to.consider, cor=as.numeric(gene.correlations))
dim(tbl.cor)  # 354 2

subset(tbl.cor, abs(cor) > 0.66)
   #       gene        cor
   # 1   NDUFS2  1.0000000
   # 113 TCF7L1 -0.6641626
   # 162   RXRB  0.7452202
   # 283   NFIA -0.6654400
   # 314  FOXK2  0.6602555

feature.guide <- list(gtex.cortex.rna.cor="cor")
default.values <- list(gtex.cortex.rna.cor=0)
ft$addGeneFeature(tbl.cor, feature.name="gtex.cortex.rna.cor", default.value=0)
tbl.ft <- ft$getTable()


   #-------------------------------------------------------------------------
   # add simulated enhancer promoter interactions
   #-------------------------------------------------------------------------

library(GenomicRanges)
library(annotatr)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
annotation.sought <- "hg38_genes_promoters"
annotations <- build_annotations(genome="hg38", annotations=annotation.sought)
gr.atac <- GRanges(tbl.mayoAtac)
tbl.ov <- as.data.frame(findOverlaps(gr.atac, annotations))


tbl.atac.promoters <- tbl.mayoAtac[tbl.ov[,1],]
keepers <- tbl.ov[,2]
tbl.atac.promoters$gene <- annotations[keepers]$symbol
rownames(tbl.atac.promoters) <- NULL
tbl.atac.promoters <- unique(subset(tbl.atac.promoters, gene==targetGene))
dim(tbl.atac.promoters) #   1 7 -> of all the mayo atac, only  chr1:161201843-16120272
                        #  is annotated as an NDUFS2 promoter
with(tbl.atac.promoters, 1 + end - start)  # 878


# generate some random enhancer/promoter links which
#    originate in mayo atac open chromatin
#    terminate within this 879 bp promoter
# how large are these regions reported to be?
#
tbl.pe <- read.table("~/github/igvR/inst/extdata/gm12878_loops.bedpe.gz", sep="\t")
colnames(tbl.pe) <- c("chr1", "start1", "end1", "chr1", "start2", "end2")
with(tbl.pe, fivenum(end1-start1))  # 5000  5000  5000 10000 10000
with(tbl.pe, fivenum(end2-start2))  # 5000  5000  5000 10000 10000

  # 	Targeted high-resolution chromosome conformation capture at genome-wide scale in mouse erythroid cells
tbl.view <- read.table("GSE160229_Capture_Viewpoints.txt.gz", sep="\t", nrow=-1)
dim(tbl.view) # 7197    4
head(tbl.view)
colnames(tbl.view) <- c("gene", "chrom", "start", "end")
tbl.view$end <- as.numeric(tbl.view$end)

fivenum(tbl.view$end-tbl.view$start)  # 69.0       461.0       745.0      1167.5 207101398.0
order(table(tbl.view$end-tbl.view$start))
   # from this, a leap:  imaginge that actual binding sites as 70 bp


tbl.atac.promoters
#      chrom     start       end width strand status   gene
#       chr1 161201843 161202721   879      *   TRUE NDUFS2
# create 20 landing sites in the promoter

starts <- sample(161201843:(161202721-70), size=nrow(tbl.mayoAtac))
tbl.landing <- data.frame(chrom="chr1", start=starts, end=(starts+70))
starts <- unlist(lapply(seq_len(nrow(tbl.mayoAtac)),
       function(r) {
           start <- tbl.mayoAtac[r, "start"];
           end   <- tbl.mayoAtac[r,"end"]
           sample(start: end-70, size=1)
           }))
tbl.launch <- data.frame(chrom="chr1", start=starts, endd=starts+70)
tbl.pe <- cbind(tbl.launch, tbl.landing)
colnames(tbl.pe) <- c("chrom1", "start1", "end1", "chrom2", "start2", "end2")
dim(tbl.pe)
random.subset <- sample(seq_len(nrow(tbl.pe)), size=5)
tbl.pe <- tbl.pe[random.subset,]
rownames(tbl.pe) <- NULL

track <- BedpeInteractionsTrack("simulated 3c", tbl.pe, color="darkgreen", trackHeight=100)
displayTrack(igv, track)

    #------------------------------------------------------------
    # any tfbs which is on either end of these paired end relations
    # deserves to be annotated to reflect that.
    # first the "launch" sites (enhancers) then the "landing" sites
    # (promoters)
    #------------------------------------------------------------

feature.guide <- list(launch.3c="status")
default.values <- list(launch.3c=FALSE)
tbl.launch <- tbl.pe[, c("chrom1", "start1", "end1")]
colnames(tbl.launch) <- c("chrom", "start", "end")
tbl.launch$status <- TRUE
ft$addRegionFeature(tbl.launch, feature.genome="hg38", feature.guide, default.values)

feature.guide <- list(landing.3c="status")
default.values <- list(landing.3c=FALSE)
tbl.landing <- tbl.pe[, c("chrom2", "start2", "end2")]
colnames(tbl.landing) <- c("chrom", "start", "end")
tbl.landing$status <- TRUE
ft$addRegionFeature(tbl.landing, feature.genome="hg38", feature.guide, default.values)

tbl.ft <- ft$getTable()
dim(tbl.ft)   # 2117704      19
tfs <- unique(tbl.ft$tf)
length(tfs)  # 535

tbl.f0 <- subset(tbl.ft, landing.3c | launch.3c & rosmap.eqtl.pvalue < 0.05)
length(unique(tbl.f0$tf))  # 275

tbl.f0 <- tbl.ft
tbl.f1 <- subset(tbl.f0, rosmap.eqtl.pvalue < 0.01)
tfs <- unique(tbl.f1$tf)
length(tfs)  # 460

tbl.f0 <- tbl.ft
tbl.f1 <- subset(tbl.f0, gtex.eqtl.pvalue < 0.01)
tfs <- unique(tbl.f1$tf)
length(tfs)  # 352

tbl.f1 <- subset(tbl.f0, gtex.eqtl.pvalue < 0.01 & rosmap.eqtl.pvalue < 0.01)
tfs <- unique(tbl.f1$tf)
length(tfs)  # 266

tbl.f2 <- subset(tbl.f1, abs(gtex.cortex.rna.cor) > 0.4 )
tfs <- unique(tbl.f2$tf)
length(tfs)  # 75

tbl.f3 <- subset(tbl.f2, mayo.atac)
tfs <- unique(tbl.f3$tf)
length(tfs)  # 5


solvers <- c("lasso", "Ridge", "Spearman", "Pearson", "RandomForest")

solver <- EnsembleSolver(mtx,  targetGene, tfs, solverNames=solvers)
system.time(tbl.trena <- run(solver))

tbl.fft <- tbl.f1
tbl.trena$tfbs <- as.integer(lapply(tbl.trena$gene, function(gene) nrow(subset(tbl.fft, tf==gene))))
gtex.eqtl.column.name <- "gtex.eqtl.rsid"

if(any(grepl(gtex.eqtl.column.name, colnames(tbl.fft)))){
   rsids.raw  <- lapply(tbl.trena$gene, function(gene) subset(tbl.fft, tf==gene)[, ..gtex.eqtl.column.name])
   rsids.by.tf <- as.character(lapply(rsids.raw, function(rsids) paste(unlist(rsids), collapse=";")))
   tbl.trena$rsids <- rsids.by.tf
   }

new.order <- order(tbl.trena$rfScore, decreasing=TRUE)
tbl.trena <- tbl.trena[new.order,]
rownames(tbl.trena) <- NULL
tbl.trena$betaLasso <- round(tbl.trena$betaLasso, digits=3)
tbl.trena$betaRidge <- round(tbl.trena$betaRidge, digits=3)
tbl.trena$spearmanCoeff <- round(tbl.trena$spearmanCoeff, digits=3)
tbl.trena$pearsonCoeff <- round(tbl.trena$pearsonCoeff, digits=3)
tbl.trena$rfScore <- round(tbl.trena$rfScore, digits=3)

wdth(1000)
tbl.trena
PriceLab/TReNA documentation built on March 21, 2023, 1:57 p.m.