inst/demos/ndufs2/ndufs2.R

library(trena)
library(ghdb)
library(igvR)

if(interactive())
   igv <- start.igv("NDUFS2", "hg38")

ghdb <- GeneHancerDB()
targetGene <- "NDUFS2"
tbl.gh <- retrieveEnhancersFromDatabase(ghdb, targetGene, tissues="all")
tbl.gh$score <- asinh(tbl.gh$combinedscore)
track <- DataFrameQuantitativeTrack("GH", tbl.gh[, c("chrom", "start", "end", "score")],
                                    autoscale=TRUE, color="black")
if(interactive()) displayTrack(igv, track)

data.dir <- "data"

   #-----------------------------------------------------------------------------------
   # create the FeatureTable, initialize with fimo results trimmed to GeneHancer range
   #----------------------------------------------------------------------------------

ft <- FeatureTable$new(target.gene=targetGene, reference.genome="hg38")
tbl.fimo <- get(load(file.path(data.dir, "tbl.fimo.NDUFS2.RData")))
dim(tbl.fimo)   # 2117704       9

                # trim to quasi-informative limits provided by GeneHancer
tbl.fimo <- subset(tbl.fimo, start > (min(tbl.gh$start) - 1000) &
                             end   < (max(tbl.gh$end)   + 1000))
dim(tbl.fimo)   #  828415      9
tbl.track <- tbl.fimo[, c("chrom", "start", "end", "p.value")]
tbl.track$p.value <- -log10(tbl.track$p.value)
track <- DataFrameQuantitativeTrack("fimo", tbl.track, autoscale=TRUE, color="darkred")
if(interactive()) displayTrack(igv, track)
ft$setFundamentalRegions(tbl.fimo)

   #-------------------------------------------------------------------
   # add GTEx brain eQTLs for NDUFS2, cortex only
   #-------------------------------------------------------------------

tbl.eqtl.gtex <- get(load(file.path(data.dir, "gtex.brain.eqtls.chr1-160684126-161688825.RData")))
dim(tbl.eqtl.gtex)   # 1690734     8
tbl.eqtl.gtex <- subset(tbl.eqtl.gtex, gene==targetGene)
dim(tbl.eqtl.gtex)   #   29553     8
table(tbl.eqtl.gtex$id)
tbl.eqtl.gtex <- subset(tbl.eqtl.gtex, id=="GTEx_V8.Brain_Cortex" & pvalue < 0.10)
dim(tbl.eqtl.gtex)   #     391     9
tbl.eqtl.gtex$score <- -log10(tbl.eqtl.gtex$pvalue)
tbl.eqtl.gtex$start <- tbl.eqtl.gtex$hg38 - 1
tbl.eqtl.gtex$end <- tbl.eqtl.gtex$hg38

tbl.eqtl.gtex.ready <- tbl.eqtl.gtex[, c("chrom", "start", "end", "pvalue", "score", "rsid")]
track <- DataFrameQuantitativeTrack("gtex cortex",
                                    tbl.eqtl.gtex.ready[, c("chrom", "start", "end", "score")],
                                    autoscale=FALSE, min=0, max=18, color="magenta")
if(interactive()) displayTrack(igv, track)

feature.guide <- list(gtex.eqtl.rsid="rsid", gtex.eqtl.pvalue="pvalue")
default.values <- list(gtex.eqtl.rsid="", gtex.eqtl.pvalue=1)
ft$addRegionFeature(tbl.eqtl.gtex.ready, feature.genome="hg38", feature.guide, default.values)


   #-------------------------------------------------------------------
   # add rosmap brain eQTLs for NDUFS2
   #-------------------------------------------------------------------

tbl.eqtl.rosmap <- get(load(file.path(data.dir, "rosmap.eqtls.RData")))
dim(tbl.eqtl.rosmap)   # 1894 14
tbl.eqtl.rosmap <- subset(tbl.eqtl.rosmap, pvalue < 0.10)
dim(tbl.eqtl.rosmap)   #  346 14
tbl.eqtl.rosmap.ready <- tbl.eqtl.rosmap[, c("chrom", "hg38", "hg38", "pvalue", "rsid")]
tbl.eqtl.rosmap.ready$score <- -log10(tbl.eqtl.rosmap$pvalue)
tbl.eqtl.rosmap.ready$start <- tbl.eqtl.rosmap$hg38 - 1
tbl.eqtl.rosmap.ready$end <- tbl.eqtl.rosmap$hg38

track <- DataFrameQuantitativeTrack("rosmap eqtl",
                                    tbl.eqtl.rosmap.ready[, c("chrom", "start", "end", "score")],
                                    autoscale=FALSE, min=0, max=18, color="random")
if(interactive()) displayTrack(igv, track)

feature.guide <- list(rosmap.eqtl.rsid="rsid", rosmap.eqtl.pvalue="pvalue")
default.values <- list(rosmap.eqtl.rsid="", rosmap.eqtl.pvalue=1)
ft$addRegionFeature(tbl.eqtl.rosmap.ready, feature.genome="hg38", feature.guide, default.values)

   #-------------------------------------------------------------------
   # add mayo atac
   #-------------------------------------------------------------------

filename <- "mayoAllPeaks.merged.96064x4.RData"
tbl.mayoAtac <- get(load(file.path(data.dir, filename)))
dim(tbl.mayoAtac)
tbl.mayoAtac <- subset(tbl.mayoAtac, chrom==tbl.eqtl.rosmap$chrom[1] &
                                     start >= min(tbl.eqtl.rosmap$hg38) - 1000 &
                                     end   <= max(tbl.eqtl.rosmap$hg38) + 1000)
dim(tbl.mayoAtac)  # 36 5
track <- DataFrameAnnotationTrack("mayo atac", tbl.mayoAtac, color="random")
if(interactive()) displayTrack(igv, track)

tbl.mayoAtac$status <- TRUE   # we don't have scores, alas
feature.guide <- list(mayo.atac="status")
default.values <- list(mayo.atac=FALSE)
ft$addRegionFeature(tbl.mayoAtac, feature.genome="hg38", feature.guide, default.values)
# tbl.ft <- ft$getTable()

   #-------------------------------------------------------------------
   # add the small set of eQTL european haplotype scores from haploreg
   #-------------------------------------------------------------------

# f <- system.file(package="trena", "extdata", "featureTable", "tbl.haploreg.european.rs4575098.RData")
tbl.haplo <- read.table(file.path(data.dir, "haploreg-rs4575098-0.2.tsv"), sep="\t", header=TRUE)
dim(tbl.haplo)
tbl.haplo$start <- tbl.haplo$hg38 - 1
tbl.haplo$end   <- tbl.haplo$hg38
track <- DataFrameQuantitativeTrack("haploreg", tbl.haplo[, c("chrom", "start", "end", "rSquared")],
                                    autoscale=FALSE, min=0, max=1)
if(interactive()) displayTrack(igv, track)

feature.guide <-  list(haploreg.rsid="rsid", haploreg.rSquared="rSquared")
default.values <- list(haploreg.rsid="",     haploreg.rSquared=0)
ft$addRegionFeature(tbl.haplo, feature.genome="hg38", feature.guide, default.values)

   #-------------------------------------------------------------------------
   # add NDUFS2/TF correlated expression scores from GTEx Brain_Cortex.RData
   #-------------------------------------------------------------------------

mtx <- get(load(file.path(data.dir, "Brain_Cortex.RData")))
dim(mtx)   # 24821 206


genes.to.consider <- c(targetGene, unique(intersect(tbl.fimo$tf, rownames(mtx))))
length(genes.to.consider)
gene.correlations <- lapply(genes.to.consider, function(gene) cor(mtx[targetGene,], mtx[gene,], method="spearman", use="complete"))
tbl.cor <- data.frame(gene=genes.to.consider, cor=as.numeric(gene.correlations))
dim(tbl.cor)  # 354 2

subset(tbl.cor, abs(cor) > 0.66)
   #       gene        cor
   # 1   NDUFS2  1.0000000
   # 113 TCF7L1 -0.6641626
   # 162   RXRB  0.7452202
   # 283   NFIA -0.6654400
   # 314  FOXK2  0.6602555

feature.guide <- list(gtex.cortex.rna.cor="cor")
default.values <- list(gtex.cortex.rna.cor=0)
ft$addGeneFeature(tbl.cor, feature.name="gtex.cortex.rna.cor", default.value=0)
tbl.ft <- ft$getTable()


   #-------------------------------------------------------------------------
   # add simulated enhancer promoter interactions
   #-------------------------------------------------------------------------

# library(GenomicRanges)
# library(annotatr)
# library(TxDb.Hsapiens.UCSC.hg38.knownGene)
# annotation.sought <- "hg38_genes_promoters"
# annotations <- build_annotations(genome="hg38", annotations=annotation.sought)
# gr.atac <- GRanges(tbl.mayoAtac)
# tbl.ov <- as.data.frame(findOverlaps(gr.atac, annotations))
#
#
# tbl.atac.promoters <- tbl.mayoAtac[tbl.ov[,1],]
# keepers <- tbl.ov[,2]
# tbl.atac.promoters$gene <- annotations[keepers]$symbol
# rownames(tbl.atac.promoters) <- NULL
# tbl.atac.promoters <- unique(subset(tbl.atac.promoters, gene==targetGene))
# dim(tbl.atac.promoters) #   1 7 -> of all the mayo atac, only  chr1:161201843-16120272
#                         #  is annotated as an NDUFS2 promoter
# with(tbl.atac.promoters, 1 + end - start)  # 878


# generate some random enhancer/promoter links which
#    originate in mayo atac open chromatin
#    terminate within this 879 bp promoter
# how large are these regions reported to be?
#
# tbl.pe <- read.table("~/github/igvR/inst/extdata/gm12878_loops.bedpe.gz", sep="\t")
# colnames(tbl.pe) <- c("chr1", "start1", "end1", "chr1", "start2", "end2")
# with(tbl.pe, fivenum(end1-start1))  # 5000  5000  5000 10000 10000
# with(tbl.pe, fivenum(end2-start2))  # 5000  5000  5000 10000 10000
#
#   # 	Targeted high-resolution chromosome conformation capture at genome-wide scale in mouse erythroid cells
# tbl.view <- read.table("GSE160229_Capture_Viewpoints.txt.gz", sep="\t", nrow=-1)
# dim(tbl.view) # 7197    4
# head(tbl.view)
# colnames(tbl.view) <- c("gene", "chrom", "start", "end")
# tbl.view$end <- as.numeric(tbl.view$end)
#
# fivenum(tbl.view$end-tbl.view$start)  # 69.0       461.0       745.0      1167.5 207101398.0
# order(table(tbl.view$end-tbl.view$start))
#    # from this, a leap:  imaginge that actual binding sites as 70 bp
#
#
# tbl.atac.promoters
#      chrom     start       end width strand status   gene
#       chr1 161201843 161202721   879      *   TRUE NDUFS2
# create 20 landing sites in the promoter

# starts <- sample(161201843:(161202721-70), size=nrow(tbl.mayoAtac))
# tbl.landing <- data.frame(chrom="chr1", start=starts, end=(starts+70))
# starts <- unlist(lapply(seq_len(nrow(tbl.mayoAtac)),
#        function(r) {
#            start <- tbl.mayoAtac[r, "start"];
#            end   <- tbl.mayoAtac[r,"end"]
#            sample(start: end-70, size=1)
#            }))
# tbl.launch <- data.frame(chrom="chr1", start=starts, endd=starts+70)
# tbl.pe <- cbind(tbl.launch, tbl.landing)
# colnames(tbl.pe) <- c("chrom1", "start1", "end1", "chrom2", "start2", "end2")
# dim(tbl.pe)
# random.subset <- sample(seq_len(nrow(tbl.pe)), size=5)
# tbl.pe <- tbl.pe[random.subset,]
# rownames(tbl.pe) <- NULL
#
tbl.pe <- get(load(file.path(data.dir, "tbl.pairedEnd.small.NDUFS2.RData")))

track <- BedpeInteractionsTrack("simulated 3c", tbl.pe, color="darkgreen", trackHeight=100)
if(interactive()) displayTrack(igv, track)

    #------------------------------------------------------------
    # any tfbs which is on either end of these paired end relations
    # deserves to be annotated to reflect that.
    # first the "launch" sites (enhancers) then the "landing" sites
    # (promoters)
    #------------------------------------------------------------

feature.guide <- list(launch.3c="status")
default.values <- list(launch.3c=FALSE)
tbl.launch <- tbl.pe[, c("chrom1", "start1", "end1")]
colnames(tbl.launch) <- c("chrom", "start", "end")
tbl.launch$status <- TRUE
ft$addRegionFeature(tbl.launch, feature.genome="hg38", feature.guide, default.values)

feature.guide <- list(landing.3c="status")
default.values <- list(landing.3c=FALSE)
tbl.landing <- tbl.pe[, c("chrom2", "start2", "end2")]
colnames(tbl.landing) <- c("chrom", "start", "end")
tbl.landing$status <- TRUE
ft$addRegionFeature(tbl.landing, feature.genome="hg38", feature.guide, default.values)


tbl.ft <- ft$getTable()
dim(tbl.ft)   # 2117704      19
tfs <- unique(tbl.ft$tf)
length(tfs)  # 535

  # tbl.fft: FilteredFeatureTable

tbl.fft <- subset(tbl.ft, gtex.eqtl.pvalue < 0.01 & rosmap.eqtl.pvalue < 0.01)
tfs <- unique(tbl.fft$tf)
length(tfs)  # 266


solvers <- c("lasso", "Ridge", "Spearman", "Pearson", "RandomForest")

solver <- EnsembleSolver(mtx,  targetGene, tfs, solverNames=solvers)
system.time(tbl.trena <- run(solver))

tbl.trena$tfbs <- as.integer(lapply(tbl.trena$gene, function(gene) nrow(subset(tbl.fft, tf==gene))))
gtex.eqtl.column.name <- "gtex.eqtl.rsid"

if(any(grepl(gtex.eqtl.column.name, colnames(tbl.fft)))){
   rsids.raw  <- lapply(tbl.trena$gene, function(gene) subset(tbl.fft, tf==gene)[, ..gtex.eqtl.column.name])
   rsids.by.tf <- as.character(lapply(rsids.raw, function(rsids) paste(unlist(rsids), collapse=";")))
   tbl.trena$rsids <- rsids.by.tf
   }

new.order <- order(tbl.trena$rfScore, decreasing=TRUE)
tbl.trena <- tbl.trena[new.order,]
rownames(tbl.trena) <- NULL
tbl.trena$betaLasso <- round(tbl.trena$betaLasso, digits=3)
tbl.trena$betaRidge <- round(tbl.trena$betaRidge, digits=3)
tbl.trena$spearmanCoeff <- round(tbl.trena$spearmanCoeff, digits=3)
tbl.trena$pearsonCoeff <- round(tbl.trena$pearsonCoeff, digits=3)
tbl.trena$rfScore <- round(tbl.trena$rfScore, digits=3)

wdth(1000)
print(tbl.trena)
PriceLab/TReNA documentation built on March 21, 2023, 1:57 p.m.