Nothing
library(TReNA)
library(RSQLite)
library(RPostgreSQL)
library(RUnit)
library(MotifDb); mdb <- MotifDb # convenience variable
library(splitstackshape)
#------------------------------------------------------------------------------------------------------------------------
fp.file <- "/Users/paul/s/work/priceLab/ohsu-aquaporin/aqp4.sqlite"
footprint.db.uri <- sprintf("sqlite://%s", fp.file);
fp.uri <- sprintf("sqlite://%s", fp.file)
genome.db.uri <- "postgres://bddsrds.globusgenomics.org/hg38"
#------------------------------------------------------------------------------------------------------------------------
tbl.mg <- read.table("~/github/TReNA/inst/extdata/motifGenes.tsv", sep="\t", as.is=TRUE, header=TRUE)
#------------------------------------------------------------------------------------------------------------------------
# if we need to do direct inspeciont
# db.fp <- dbConnect(dbDriver("SQLite"), fp.file)
#------------------------------------------------------------------------------------------------------------------------
tss <- 26865884 # minus strand
tssUpstream <- 500
tssDownstream <- 500
target.gene <- "AQP4"
#------------------------------------------------------------------------------------------------------------------------
if(!exists("mtx")){
load("~/github/projects/examples/microservices/trenaGeneModel/datasets/coryAD/rosmap_counts_matrix_normalized_geneSymbols_25031x638.RData")
#load(system.file(package="TReNA", "extdata/ampAD.154genes.mef2cTFs.278samples.RData"))
mtx <- asinh(mtx)
mtx.var <- apply(mtx, 1, var)
deleters <- which(mtx.var < 0.01)
if(length(deleters) > 0) # 15838 x 638
mtx <- mtx[-deleters,]
mtx <<- mtx
}
#------------------------------------------------------------------------------------------------------------------------
genome.name <- "hg38"
tbl.snp <- data.frame(target.gene=rep("AQP4", 5),
chromosome=rep("chr18", 5),
loc=c(26864410, 26865469, 26855623, 26855854, 26850565),
snp=c("rs3763040", "rs3875089", "rs335929", "rs3763043", "rs9951307"),
shoulder=rep(2000, 5),
genome=rep(genome.name, 5),
stringsAsFactors=FALSE)
#------------------------------------------------------------------------------------------------------------------------
find.footprints <- function(chrom, start, end)
{
fp <- FootprintFinder(genome.db.uri, fp.uri, quiet=TRUE)
tbl.fp <- getFootprintsInRegion(fp, chrom, start, end)
} # find.footprints
#------------------------------------------------------------------------------------------------------------------------
find.snps.in.footprints <- function()
{
for(r in 1:nrow(tbl.snp)){
chrom <- tbl.snp$chromosome[r]
loc <- tbl.snp$loc[r]
shoulder <- 20
query <- sprintf("select * from regions where chrom='%s' and start < %d and endpos > %d",
chrom, loc+shoulder, loc-shoulder)
printf("---- %s: %d", tbl.snp$snp[r], tbl.snp$loc[r])
print(dbGetQuery(db.fp, query))
} # for r
} # find.snps.in.footprints
#------------------------------------------------------------------------------------------------------------------------
getFootprintCandidates <- function(target.gene, tssUpstream, tssDownstream)
{
regionsSpec <- "chr18:26865459-26865479"
#regionsSpec <- list()
#geneCenteredSpec <- list(targetGene=target.gene, tssUpstream=tssUpstream, tssDownstream=tssDownstream)
geneCenteredSpec <- list();
filterSpec <- list(genomeDB=genome.db.uri,
footprintDB=footprint.db.uri,
geneCenteredSpec=geneCenteredSpec,
regionsSpec=regionsSpec)
filter <- FootprintFilter(filterSpec$genomeDB,
filterSpec$footprintDB,
filterSpec$geneCenteredSpec,
filterSpec$regionsSpec,
quiet=TRUE)
getCandidates(filter)
} # getFootprintCandidates
#------------------------------------------------------------------------------------------------------------------------
getDHSCandidates <- function(target.gene, tssUpstream, tssDownstream)
{
genome <- "hg38"
regionsSpec <- "chr18:26865459-26865479"
#regionsSpec <- NA_character_;
#geneCenteredSpec <- list(targetGene=target.gene, tssUpstream=tssUpstream, tssDownstream=tssDownstream)
geneCenteredSpec <- list()
#variants <- NA_character_
variants <- "rs3875089"
filterSpec <- list(filterType="EncodeDNaseClusters",
genomeName=genome,
encodeTableName="wgEncodeRegDnaseClustered",
pwmMatchPercentageThreshold=75L,
geneInfoDB="postgres://whovian/gtf",
geneCenteredSpec=geneCenteredSpec,
regionsSpec=regionsSpec,
variants=variants)
filter <- with(filterSpec, HumanDHSFilter(genomeName,
encodeTableName=encodeTableName,
pwmMatchPercentageThreshold=85L,
geneInfoDatabase.uri=geneInfoDB,
regionsSpec=regionsSpec,
geneCenteredSpec=geneCenteredSpec,
quiet=FALSE))
getCandidates(filter)
} # getDHSCandidates
#------------------------------------------------------------------------------------------------------------------------
#createModel <- function(target.gene, tssUpstream, tssDownstream)
createModel <- function(target.gene, chrom, start, end, variants=NA_character_)
{
#geneCenteredSpec <- list(targetGene=target.gene, tssUpstream=tssUpstream, tssDownstream=tssDownstream)
geneCenteredSpec <- list()
#regionsSpec <- list()
regionsSpec <- sprintf("%s:%d-%d", chrom, start, end)
filterSpec <- list(genomeDB=genome.db.uri,
footprintDB=footprint.db.uri,
geneCenteredSpec=geneCenteredSpec,
regionsSpec=regionsSpec)
filter <- FootprintFilter(filterSpec$genomeDB,
filterSpec$footprintDB,
filterSpec$geneCenteredSpec,
filterSpec$regionsSpec,
quiet=TRUE)
x.fp <- getCandidates(filter)
genome <- "hg38"
#regionsSpec <- "chr18:26865459-26865479"
#regionsSpec <- NA_character_;
#geneCenteredSpec <- list(targetGene=target.gene, tssUpstream=tssUpstream, tssDownstream=tssDownstream)
#geneCenteredSpec <- list()
filterSpec <- list(filterType="EncodeDNaseClusters",
genomeName=genome,
encodeTableName="wgEncodeRegDnaseClustered",
pwmMatchPercentageThreshold=75L,
geneInfoDB="postgres://whovian/gtf",
geneCenteredSpec=geneCenteredSpec,
regionsSpec=regionsSpec,
variants=variants)
filter <- with(filterSpec, HumanDHSFilter(genomeName,
encodeTableName=encodeTableName,
pwmMatchPercentageThreshold=90L,
geneInfoDatabase.uri=geneInfoDB,
regionsSpec=regionsSpec,
geneCenteredSpec=geneCenteredSpec,
variants=variants,
quiet=FALSE))
x.dhs <- getCandidates(filter)
tfs.fp <- intersect(rownames(mtx), x.fp$tfs)
if(length(tfs.fp) > 0){
solver.wt <- RandomForestSolver(mtx, targetGene=target.gene, candidateRegulators=tfs.fp)
model.fp <- run(solver.wt)
}
else{
printf("no footprint-derived tfs")
x.fp <- list(tfs=NA_character_, tbl=data.frame())
model.fp <- list()
}
tfs.dhs <- intersect(rownames(mtx), x.dhs$tfs)
solver.wt <- RandomForestSolver(mtx, targetGene=target.gene, candidateRegulators=tfs.dhs)
model.dhs <- run(solver.wt)
return(list(candidates.fp=x.fp, model.fp=model.fp, candidates.dhs=x.dhs, model.dhs=model.dhs))
} # createModel
#------------------------------------------------------------------------------------------------------------------------
createDHSModel <- function(target.gene, chrom, start, end, pwmMatchThreshold=80L, variants=NA_character_)
{
regionsSpec <- sprintf("%s:%d-%d", chrom, start, end)
geneCenteredSpec <- list()
genome <- "hg38"
filterSpec <- list(filterType="EncodeDNaseClusters",
genomeName=genome,
encodeTableName="wgEncodeRegDnaseClustered",
pwmMatchPercentageThreshold=pwmMatchThreshold,
geneInfoDB="postgres://whovian/gtf",
geneCenteredSpec=geneCenteredSpec,
regionsSpec=regionsSpec,
variants=variants)
filter <- with(filterSpec, HumanDHSFilter(genomeName,
encodeTableName=encodeTableName,
pwmMatchPercentageThreshold=90L,
geneInfoDatabase.uri=geneInfoDB,
regionsSpec=regionsSpec,
geneCenteredSpec=geneCenteredSpec,
variants=variants,
quiet=FALSE))
x.dhs <- getCandidates(filter)
tfs.dhs <- intersect(rownames(mtx), x.dhs$tfs)
solver.wt <- RandomForestSolver(mtx, targetGene=target.gene, candidateRegulators=tfs.dhs)
model.dhs <- run(solver.wt)
return(list(candidates=x.dhs, model=model.dhs))
} # createDHSModel
#------------------------------------------------------------------------------------------------------------------------
# 18:26865469 A/G
test.createModel <- function()
{
# chr18:26,860,742-26,882,685: includes all footprints and dhs clusters
# chr18:26,860,992-26,870,492: a 10kb region enriched with brain hint footprints
# chr18:26,864,303-26,866,143: a 1.8kb region including tss
start <- 26864303 #26860992 #26860742
end <- 26866143 #26870492 # 26882685
m1 <- createModel("AQP4", "chr18", start, end)
m1.mut <- createModel("AQP4", "chr18", start, end, variants="rs3875089")
# chr18:26865450-26865480: 30 bases around rs3875089
start <- 26865450
end <- 26865480
m2 <- createModel("AQP4", "chr18", start, end)
m2.mut <- createModel("AQP4", "chr18", start, end, variants="rs3875089")
if(length(m$model.fp) > 0){
head(m$model.dhs$edges)
# IncNodePurity gene.cor
# TEAD1 39.052237 0.7605383
# ATF7 19.679876 0.7163899
# SMAD9 15.450478 0.6694617
# SP3 13.706970 0.6837160
# NFE2L2 9.666957 0.6886920
# GLI2 9.514838 0.5750392
m$candidates.dhs$tbl[grep("TEAD", m$candidates.fp$tbl$tf),]
# chrom start end motifName length strand score1 score2 score3 tf
# 5851 chr18 26865890 26865897 MA0808.1 8 - 9 12.3265 6.51e-05 TEAD3;TEAD1;TEAD2;TEAD4
} # if fp results
head(m$model.dhs$edges)
m$candidates.dhs$tbl[grep("TEAD", m$candidates.dhs$tbl$tf),]
tbl.around.snp2 <- subset(m$candidates.dhs$tbl, motifStart<tbl.snp$loc[2] & motifEnd > tbl.snp$loc[2])
tfs.big <- rownames(subset(m$model.dhs$edges, IncNodePurity > 5))
# which of these high value tfs have a motif that includes snp2?
tbl.around.snp2[unlist(lapply(tfs.big, function(tf) grep(tf, tbl.around.snp2$tfs))),]
# motifName chrom motifStart motifEnd strand motifScore motifRelativeScore match regulatoryRegionStart regualtoryRegionEnd regulatorySequence variant tfs
# 17 MA0090.2 chr18 26865465 26865474 + 5.653203 0.8174939 AGCATCCCTT 26865450 26865480 AGGATTTGGCTAAAAAG... wt TEAD1;TEAD2;TEAD3;TEAD4
# 111 MA0808.1 chr18 26865466 26865473 + 5.564402 0.8368466 GCATCCCT 26865450 26865480 AGGATTTGGCTAAAAAG... wt TEAD3;TEAD1;TEAD2;TEAD4
# 112 MA0809.1 chr18 26865465 26865474 + 5.470297 0.8060669 AGCATCCCTT 26865450 26865480 AGGATTTGGCTAAAAAG... wt TEAD4;TEAD1;TEAD2;TEAD3
# 16 MA0084.1 chr18 26865468 26865476 - 4.535714 0.7016575 TTTGGCTAA 26865450 26865480 AGGATTTGGCTAAAAAG... wt SRY;SOX30;SOX15;SOX7;SOX17;SOX18;SOX8;SOX9;SOX10;SOX5;SOX6;SOX13;SOX4;SOX11;SOX12;SOX1;SOX2;SOX3;SOX14;SOX21
# 16.1 MA0084.1 chr18 26865468 26865476 - 4.535714 0.7016575 TTTGGCTAA 26865450 26865480 AGGATTTGGCTAAAAAG... wt SRY;SOX30;SOX15;SOX7;SOX17;SOX18;SOX8;SOX9;SOX10;SOX5;SOX6;SOX13;SOX4;SOX11;SOX12;SOX1;SOX2;SOX3;SOX14;SOX21
# 56 MA0597.1 chr18 26865467 26865475 + 4.070352 0.7155477 CATCCCTTT 26865450 26865480 AGGATTTGGCTAAAAAG... wt THAP1;THAP10;THAP11;PRKRIR;THAP2;THAP3;THAP4;THAP5;THAP6;THAP7;THAP8;THAP9
# 130 MA0886.1 chr18 26865463 26865472 - 5.230989 0.7422321 GCTAAAAAGC 26865450 26865480 AGGATTTGGCTAAAAAG... wt EMX2;NANOG;NOTO;VENTX;BSX;HHEX;HLX;EN1;EN2;EMX1;DLX1;DLX2;DLX3;DLX4;DLX5;DLX6;DBX1;DBX2;VAX1;VAX2;TLX1;TLX2;TLX3;BARX1;BARX2;HMX1;HMX2;HMX3;MSX1;MSX2;LBX1;LBX2;BARHL1;BARHL2;NR2E1
# 87 MA0710.1 chr18 26865463 26865472 - 5.096429 0.7505068 GCTAAAAAGC 26865450 26865480 AGGATTTGGCTAAAAAG... wt NOTO;NANOG;VENTX;BSX;HHEX;HLX;EN1;EN2;EMX1;EMX2;DLX1;DLX2;DLX3;DLX4;DLX5;DLX6;DBX1;DBX2;VAX1;VAX2;TLX1;TLX2;TLX3;BARX1;BARX2;HMX1;HMX2;HMX3;MSX1;MSX2;LBX1;LBX2;BARHL1;BARHL2;NR2E1
# 81 MA0699.1 chr18 26865463 26865472 - 4.596502 0.7295830 GCTAAAAAGC 26865450 26865480 AGGATTTGGCTAAAAAG... wt LBX2;NANOG;NOTO;VENTX;BSX;HHEX;HLX;EN1;EN2;EMX1;EMX2;DLX1;DLX2;DLX3;DLX4;DLX5;DLX6;DBX1;DBX2;VAX1;VAX2;TLX1;TLX2;TLX3;BARX1;BARX2;HMX1;HMX2;HMX3;MSX1;MSX2;LBX1;BARHL1;BARHL2;NR2E1
# 127 MA0879.1 chr18 26865463 26865472 - 4.369845 0.7033855 GCTAAAAAGC 26865450 26865480 AGGATTTGGCTAAAAAG... wt DLX1;NANOG;NOTO;VENTX;BSX;HHEX;HLX;EN1;EN2;EMX1;EMX2;DLX2;DLX3;DLX4;DLX5;DLX6;DBX1;DBX2;VAX1;VAX2;TLX1;TLX2;TLX3;BARX1;BARX2;HMX1;HMX2;HMX3;MSX1;MSX2;LBX1;LBX2;BARHL1;BARHL2;NR2E1
# 122 MA0876.1 chr18 26865464 26865471 - 4.064815 0.7189870 CTAAAAAG 26865450 26865480 AGGATTTGGCTAAAAAG... wt BSX;NANOG;NOTO;VENTX;HHEX;HLX;EN1;EN2;EMX1;EMX2;DLX1;DLX2;DLX3;DLX4;DLX5;DLX6;DBX1;DBX2;VAX1;VAX2;TLX1;TLX2;TLX3;BARX1;BARX2;HMX1;HMX2;HMX3;MSX1;MSX2;LBX1;LBX2;BARHL1;BARHL2;NR2E1
# next up: do with the tbl.snp[2,] variant. does these tfs.big drop out?
} # test.createModel
#------------------------------------------------------------------------------------------------------------------------
createModels <- function(spec, mtx)
{
stopifnot(all(c("target.gene", "chromosome", "loc", "snp", "shoulder", "genome") %in% names(spec)))
regionsSpec <- with(spec, sprintf("%s:%d-%d", chromosome, loc-shoulder, loc+shoulder))
dhsFilterSpec <- list(filterType="EncodeDNaseClusters",
genomeName=spec$genome,
encodeTableName="wgEncodeRegDnaseClustered",
pwmMatchPercentageThreshold=80L,
geneInfoDB="postgres://whovian/gtf",
geneCenteredSpec=list(),
regionsSpec=c(regionsSpec),
variants=spec$snp)
hdf.wt <- with(dhsFilterSpec,
HumanDHSFilter(genomeName=genomeName,
encodeTableName=encodeTableName,
pwmMatchPercentageThreshold=pwmMatchPercentageThreshold,
geneCenteredSpec=geneCenteredSpec,
geneInfoDatabase.uri=geneInfoDB,
regionsSpec=regionsSpec,
quiet=TRUE))
x.wt <- getCandidates(hdf.wt)
# find any motif matches, in regulatory regions, which contain the variant site
subset(x.wt$tbl, motifStart < spec$loc & motifEnd > spec$loc)
hdf.mut <- with(dhsFilterSpec,
HumanDHSFilter(genomeName=genomeName,
encodeTableName=encodeTableName,
pwmMatchPercentageThreshold=pwmMatchPercentageThreshold,
geneCenteredSpec=geneCenteredSpec,
geneInfoDatabase.uri=geneInfoDB,
regionsSpec=regionsSpec,
variants=variants,
quiet=TRUE))
x.mut <- getCandidates(hdf.mut)
# find any motif matches, in regulatory regions, which contain the variant site
#subset(x.wt$tbl, motifStart <= spec$loc & motifEnd >= spec$loc)
#subset(x.mut$tbl, motifStart <= spec$loc & motifEnd >= spec$loc)
tfs.wt <- intersect(rownames(mtx), x.wt$tfs)
tfs.mut <- intersect(rownames(mtx), x.mut$tfs)
identical.tfs <- (length(tfs.wt) == length(tfs.mut)) & all(tfs.wt %in% tfs.mut)
printf("%d wt tfs, %d mut tfs, identical? %s", length(tfs.wt), length(tfs.mut), identical.tfs)
model.wt <- data.frame()
model.mut <- data.frame()
if(length(tfs.wt) > 0){
solver.wt <- RandomForestSolver(mtx, targetGene=spec$target.gene, candidateRegulators=tfs.wt)
model.wt <- run(solver.wt)
}
else{
printf("no candidate tfs for wt")
}
if(length(tfs.mut) > 0){
solver.mut <- RandomForestSolver(mtx, targetGene=spec$target.gene, candidateRegulators=tfs.mut)
model.mut <- run(solver.mut)
}
else{
printf("no candidate tfs for wt")
}
list(wt.candidates=x.wt, mut.candidates=x.mut, wt.model=model.wt, mut.model=model.mut)
} # createModels
#------------------------------------------------------------------------------------------------------------------------
explore.aqp4 <- function()
{
start <- 26864303 #26860992 #26860742
end <- 26866143 #26870492 # 26882685
start.300bp <- 26865352
end.300bp <- 26865619
start <- start.300bp
end <- end.300bp
#m1 <- createDHSModel("AQP4", "chr18", start, end, 95L)
tv <- TReNA.Viz(portRange=11011:11051)
# addBedTrackFromHostedFile(tv,
# trackName="brain HINT",
# uri="http://pshannon.systemsbiology.net/annotations/brain_hint.bed.gz",
# index.uri="http://pshannon.systemsbiology.net/annotations/brain_hint.bed.gz.tbi",
# displayMode="SQUISHED",
# color="blue")
# addBedTrackFromHostedFile(tv,
# trackName="skin HINT",
# uri="http://pshannon.systemsbiology.net/annotations/skin_hint.bed.gz",
# index.uri="http://pshannon.systemsbiology.net/annotations/skin_hint.bed.gz.tbi",
# displayMode="SQUISHED",
# color="blue")
addBedTrackFromHostedFile(tv,
trackName="EncodeDHSclustered",
uri="http://pshannon.systemsbiology.net/annotations/dhsClusters_hg38.bed.gz",
index.uri="http://pshannon.systemsbiology.net/annotations/dhsClusters_hg38.bed.gz.tbi",
displayMode="SQUISHED",
color="blue")
addBedTrackFromHostedFile(tv,
trackName="AQP4 snps",
uri="http://pshannon.systemsbiology.net/annotations/aqp4-chr18-snps.bed",
index.uri=NA,
displayMode="SQUISHED",
color="red")
#tbl.bed <- m1$candidates$tbl[, c("chrom", "motifStart", "motifEnd", "motifName", "motifScore")]
#addBedTrackFromDataFrame(tv, "DHS motifs", tbl.bed, color="green")
tv
} # explore.aqp4
#----------------------------------------------------------------------------------------------------
displayAllTracks <- function(chrom, start, end)
{
if(!exists("tv"))
tv <<- explore.aqp4()
if(!exists("db"))
db <<- dbConnect(PostgreSQL(), user= "trena", password="trena", host="whovian")
dbNames <- c("brain_hint", "brain_wellington",
"lymphoblast_hint", "lymphoblast_wellington",
"skin_hint", "skin_wellington")
actual.dbNames <- grep("_", dbGetQuery(db, "select datname from pg_database")[, 1], value=TRUE)
dbNames <- intersect(dbNames, actual.dbNames)
genome.db.uri <- "postgres://whovian/hg38"
# chrom <- "chr18"
# # based on all 5 snps
# start.17kb.all.snps <- 26849000
# end.17kb.all.snps <- 26866000
# # based on tss
# start <- tss - 2000
# end <- tss + 2000
# start <- 26847227
# end <- 26869162
# start.81kb <- 26824211
# end.81kb <- 26905511
# start <- start.81kb
# end <- end.81kb
showRegion(tv, sprintf("%s:%d-%d", chrom, start, end))
tbl.regions <- data.frame();
load("aqp4_fp.RData")
tbl.cory <- fp$AQP4$tbl[, c(3, 4, 5, 1, 15)]
tbl.cory$source <- "cory"
tbl.regions <- rbind(tbl.regions, tbl.cory)
addBedTrackFromDataFrame(tv, "coryFP", tbl.cory, displayMode="COLLAPSED", color="cyan")
for(dbName in dbNames){
printf("--- querying %s", dbName)
fp.db.uri <- sprintf("postgres://whovian/%s", dbName)
fpf <- FootprintFinder(genome.db.uri, fp.db.uri)
tbl.fp <- getFootprintsInRegion(fpf, chrom, start, end)
printf("%40s: %5d footprints", dbName, nrow(tbl.fp))
if(nrow(tbl.fp) > 0){
tbl.bed <- tbl.fp[, c("chrom", "start", "endpos", "name", "score2")]
tbl.bed$source <- dbName
tbl.regions <- rbind(tbl.regions, tbl.bed)
addBedTrackFromDataFrame(tv, dbName, tbl.bed, displayMode="COLLAPSED", color="green")
}
closeDatabaseConnections(fpf)
} # for dbName
addBedTrackFromDataFrame(tv, "fp.all", tbl.regions, color="orange")
save(tbl.regions, file="all.aqp4.footprints.RData")
invisible(tbl.regions)
} # displayAllTracks
#----------------------------------------------------------------------------------------------------
getCandidates <- function(tbl.regions)
{
i <- grep("endpos", colnames(tbl.regions))
if(length(i) > 0)
colnames(tbl.regions)[i] <- "end"
motifMatcher <- MotifMatcher(name="mm", genomeName="hg38", quiet=FALSE)
#x <- findMatchesByChromosomalRegion(motifMatcher, tbl.regions, pwmMatchMinimumAsPercentage=92)
tbl.regions.single <- data.frame(chrom="chr18", start=26863884, end=26867884, stringsAsFactors=FALSE)
x92 = findMatchesByChromosomalRegion(motifMatcher, tbl.regions.single, pwmMatchMinimumAsPercentage=92)
x70 = findMatchesByChromosomalRegion(motifMatcher, tbl.regions.single, pwmMatchMinimumAsPercentage=70)
print(system.time(x2 <- findMatchesByChromosomalRegion(motifMatcher, tbl.regions.single, pwmMatchMinimumAsPercentage=72)))
tfs <- intersect(rownames(mtx), x92$tfs)
solver.wt <- RandomForestSolver(mtx, targetGene="AQP4", candidateRegulators=tfs)
model.wt <- run(solver.wt)
snps <- tbl.snp$snp
x92.snps = findMatchesByChromosomalRegion(motifMatcher, tbl.regions.single, pwmMatchMinimumAsPercentage=92, variants=snps)
} # getCandidates
#----------------------------------------------------------------------------------------------------
newFriday.rs3875089 <- function()
{
mm <- MotifMatcher(name="mm", genomeName="hg38", quiet=FALSE)
rsid <- "rs3875089"
loc <- subset(tbl.snp, snp==rsid)$loc
tbl.regions.noSeq <- data.frame(chrom="chr18",
start=loc - 10,
end=loc + 10,
stringsAsFactors=FALSE)
tbl.wt <- getSequence(mm, tbl.regions.noSeq)
tbl.mut <- getSequence(mm, tbl.regions.noSeq, rsid)
checkTrue(tbl.wt$seq != tbl.mut$seq)
x.wt <- findMatchesByChromosomalRegion(mm, tbl.regions.noSeq, pwmMatchMinimumAsPercentage=80)
x.mut <- findMatchesByChromosomalRegion(mm, tbl.regions.noSeq, pwmMatchMinimumAsPercentage=80, rsid)
} # newFriday.rs3875089
#----------------------------------------------------------------------------------------------------
identifyMotifs <- function(chrom="chr18", start=26865324, end=26866747)
{
mm <- MotifMatcher(name="mm", genomeName="hg38", quiet=FALSE)
tbl.regions.noSeq <- data.frame(chrom=chrom,
start=start, # min(tbl.snp$loc) - 100,
end=end, #max(tbl.snp$loc) + 5000,
stringsAsFactors=FALSE)
tbl.wt <- getSequence(mm, tbl.regions.noSeq)
tbl.mut <- getSequence(mm, tbl.regions.noSeq, tbl.snp$snp)
checkEquals(nchar(tbl.wt$seq), nchar(tbl.mut$seq))
for(i in 1:nchar(tbl.mut$seq)){
wt.base <- substr(tbl.wt$seq, i, i)
mut.base <- substr(tbl.mut$seq, i, i)
if(wt.base != mut.base)
printf("variant at position %d: %s -> %s", i, wt.base, mut.base)
} # for i
x.wt <- findMatchesByChromosomalRegion(mm, tbl.regions.noSeq, pwmMatchMinimumAsPercentage=80)
x.mut <- findMatchesByChromosomalRegion(mm, tbl.regions.noSeq, pwmMatchMinimumAsPercentage=80, variants=tbl.snp$snp)
tbl.wt.freq <- as.data.frame(table(x.wt$tbl$motifName))
tbl.wt.freq <- tbl.wt.freq[order(tbl.wt.freq$Freq, decreasing=TRUE),]
colnames(tbl.wt.freq) <- c("motif", "wtCount")
tbl.mut.freq <- as.data.frame(table(x.mut$tbl$motifName))
tbl.mut.freq <- tbl.mut.freq[order(tbl.mut.freq$Freq, decreasing=TRUE),]
colnames(tbl.mut.freq) <- c("motif", "mutCount")
# "MA0751.1" lost in
tbl.counts <- merge(tbl.wt.freq, tbl.mut.freq, by="motif", all=TRUE);
tbl.counts$mutCount[is.na(tbl.counts$mutCount)] <- 0
tbl.counts$wtCount[is.na(tbl.counts$wtCount)] <- 0
tbl.counts$diff <- tbl.counts$wtCount - tbl.counts$mutCount
tbl.counts <- tbl.counts[order(abs(tbl.counts$diff), decreasing=TRUE),]
tbl.mg <- read.table("~/github/TReNA/inst/extdata/motifGenes.tsv", sep="\t", as.is=TRUE, header=TRUE)
tbl.counts$motif <- as.character(tbl.counts$motif)
tfs <- unlist(lapply(tbl.counts$motif, function(m) paste(subset(tbl.mg, motif==m)$tf.gene, collapse=";")))
tbl.counts$tfs <- tfs
mapMotifToTF <- function(motif){
motif.stem <- strsplit(motif, ".", fixed=TRUE)[[1]][1]
motif.stem.regex <- sprintf("%s\\.", motif.stem)
tfs <- unique(mcols(query(mdb, motif.stem.regex))$geneSymbol)
paste(tfs, collapse=";")
}
mdb.tfs <- unlist(lapply((tbl.counts$motif), mapMotifToTF))
tbl.counts$motifDB.tfs <- mdb.tfs
save(tbl.counts, file="aqp4.tbl.counts.5kbUpstream.RData")
wt.ma0090.locs <- unique(with(subset(x.wt$tbl, motifName=="MA0090.2"), sprintf("%s:%d-%d", chrom, motifStart, motifEnd)))
mut.ma0090.locs <- unique(with(subset(x.mut$tbl, motifName=="MA0090.2"), sprintf("%s:%d-%d", chrom, motifStart, motifEnd)))
setdiff(wt.ma0090.locs, mut.ma0090.locs) # [1] "chr18:26855847-26855856" "chr18:26865465-26865474"
subset(x.wt$tbl, (motifStart==26855847 | motifStart==26865465) & motifName=="MA0090.2")
# motifName chrom motifStart motifEnd strand motifScore motifRelativeScore match chromStart chromEnd seq status tf
# |
# 3462 MA0090.2 chr18 26855847 26855856 + 5.734231 0.8292111 AATATTCCAG 26850465 26865569 CCCATATATATGCTCACAATTGATAATTATTCTAATG... wt TEAD1;TEAD2;TEAD3;TEAD4
# |
# 3481 MA0090.2 chr18 26865465 26865474 + 5.653203 0.8174939 AGCATCCCTT 26850465 26865569 CCCATATATATGCTCACAATTGATAATTATTCTAATG... wt TEAD1;TEAD2;TEAD3;TEAD4
# | |
# A [ 218 734 212 1132 0 265 29 0 621 280 ]
# C [ 454 100 920 63 10 33 1132 1132 109 346 ]
# G [ 147 398 21 0 4 0 2 1 206 138 ]
# T [ 314 51 18 32 1132 867 24 291 511 367 ]
tbl.snp
# target.gene chromosome loc snp shoulder genome
# AQP4 chr18 26864410 rs3763040 2000 hg38
# AQP4 chr18 26865469 rs3875089 2000 hg38 A->G (reverse)
# AQP4 chr18 26855623 rs335929 2000 hg38
# AQP4 chr18 26855854 rs3763043 2000 hg38 A->G (reverse?)
# AQP4 chr18 26850565 rs9951307 2000 hg38
} # identifyMotifs
#----------------------------------------------------------------------------------------------------
buildModel <- function()
{
if(!exists("tbl.count"))
load("aqp4.tbl.counts.5kbUpstream.RData", envir=.GlobalEnv)
printf("motifs mapped to genes in motifDB: %d", length(intersect(toupper(tbl.counts$motifDB.tfs), rownames(mtx)))) # 101
tfs.oldStyle <- unique(unlist(strsplit((tbl.counts$tfs), ";")))
printf("tfs.oldStyle: %d", length(tfs.oldStyle)) # 714
printf("motifs mapped to genes old style: %d", length(intersect(tfs.oldStyle, rownames(mtx)))) # 466
tfs <- intersect(rownames(mtx), tfs.oldStyle)
solver.wt <- RandomForestSolver(mtx, targetGene="AQP4", candidateRegulators=tfs)
model.wt <- run(solver.wt)
tfs.strong <- rownames(subset(model.wt$edges, IncNodePurity > 2))
} # buildModel
#----------------------------------------------------------------------------------------------------
init.tv <- function(all.tracks=FALSE)
{
tv <- TReNA.Viz(portRange=11011:11051)
addBedTrackFromHostedFile(tv,
trackName="EncodeDHSclustered",
uri="http://pshannon.systemsbiology.net/annotations/dhsClusters_hg38.bed.gz",
index.uri="http://pshannon.systemsbiology.net/annotations/dhsClusters_hg38.bed.gz.tbi",
displayMode="SQUISHED",
color="blue")
addBedTrackFromHostedFile(tv,
trackName="AQP4 snps",
uri="http://pshannon.systemsbiology.net/annotations/aqp4-chr18-snps.bed",
index.uri=NA,
displayMode="SQUISHED",
color="red")
showRegion(tv, "chr18:26,865,340-26,865,658")
if(all.tracks){
displayAllTracks()
} # all.tracks
tv
} # init.tv
#----------------------------------------------------------------------------------------------------
runBasic <- function(chrom="chr18", start=26865462, end=26865867, min.motif.score=80L)
{
chrom="chr18"; start=26865462; end=26865867; min.motif.score=80L # short and simple, just 400bp
chrom="chr18"; start=26849820; end=26871166 # 21kb
tv <- init.tv()
mm <- MotifMatcher(name="mm", genomeName="hg38", quiet=TRUE)
tbl.regions.noSeq <- data.frame(chrom=chrom,
start=start, # min(tbl.snp$loc) - 100,
end=end, #max(tbl.snp$loc) + 5000,
stringsAsFactors=FALSE)
tbl.wt <- getSequence(mm, tbl.regions.noSeq)
printf("findMatchesByChromosomalRegion, size: %d", 1 + end - start);
x.wt <- findMatchesByChromosomalRegion(mm, tbl.regions.noSeq, pwmMatchMinimumAsPercentage=min.motif.score)
tfs.oldStyle <- unique(unlist(strsplit((x.wt$tbl$tf), ";")))
printf("tfs.oldStyle: %d", length(tfs.oldStyle)) # 714
printf("motifs mapped to genes old style: %d", length(intersect(tfs.oldStyle, rownames(mtx)))) # 466
tfs <- intersect(rownames(mtx), tfs.oldStyle)
solver.wt <- RandomForestSolver(mtx, targetGene="AQP4", candidateRegulators=tfs)
printf("calling randomForestSolver");
model.wt <- run(solver.wt)
tfs.strong <- rownames(subset(model.wt$edges, IncNodePurity > 2))
showRegion(tv, sprintf("%s:%d-%d", chrom, start, end))
#addBedTrackFromDataFrame(tv, "all motifs", x.wt$tbl[, c(2,3,4,1, 6)], color="grey", displayMode="COLLAPSED")
for(tf.strong in tfs.strong){
trackName <- sprintf("%s motifs", tf.strong)
#addBedTrackFromDataFrame(tv, trackName, x.wt$tbl[grep(tf.strong, x.wt$tbl$tf), c(2,3,4,1, 6)], color="green")
}
# expand x.wt$tbl (motifs & their tfs) to one row per motif/tf. then remove all but tfs.strong
# the resulting table should be ready for TReNA-Viz::geneRegulatoryModelToGraph
printf("----- expanding tfs")
tbl.trimmed <- subset(x.wt$tbl, nchar(tf) != 0)
tfs.split <- strsplit(tbl.trimmed$tf, ";")
length(tfs.split) # [1] 36929
counts <- unlist(lapply(tfs.split, length))
tfs.split.vec <- unlist(tfs.split)
tbl.motifs <- expandRows(tbl.trimmed, counts, count.is.col=FALSE, drop=FALSE)
checkEquals(length(tfs.split.vec), nrow(tbl.motifs))
tbl.motifs$tf <- tfs.split.vec
tbl.motifs <- subset(tbl.motifs, tf %in% tfs.strong)
tbl.motifs$distance.from.tss <- tss - tbl.motifs$motifStart
printf("----- expanding tfs, done")
count <- nrow(model.wt$edges)
tbl.model <- data.frame(tf=rownames(model.wt$edges),
randomForest=model.wt$edges$IncNodePurity,
pearson=model.wt$edges$gene.cor,
spearman=rep(0, count),
betaLasso=rep(0, count),
pcaMax=rep(0, count),
concordance=rep(0, count),
stringsAsFactors=FALSE)
tbl.model <- subset(tbl.model, randomForest >= 2)
printf("converting model to graph, %d tfs, %d motif sites", nrow(tbl.model), nrow(tbl.motifs))
system.time(g <- geneRegulatoryModelToGraph(tv, "AQP4", tbl.model, tbl.motifs))
printf("adding model layout");
system.time(g.lo <- TReNA:::addGeneModelLayout(g))
printf("addGraph")
addGraph(tv, g.lo)
loadStyle(tv, "style.js")
xyz <- 99
} # runBasic
#----------------------------------------------------------------------------------------------------
runWithAugmentedFootprints <- function()
{
start.81kb <- 26824211
end.81kb <- 26905511
chrom <- "chr18"
start.300bp <- 26865352
end.300bp <- 26865619
start.1425bp <- 26864212
end.1425bp <- 26865636
start <- start.1425bp
end <- end.1425bp
tbl.regions <- displayAllTracks(chrom, start, end)
dim(tbl.regions) # 88938 x 5
mm <- MotifMatcher(name="mm", genomeName="hg38", quiet=TRUE)
tbl.regions.noSeq <- data.frame(chrom=chrom,
start=start, # min(tbl.snp$loc) - 100,
end=end, #max(tbl.snp$loc) + 5000,
stringsAsFactors=FALSE)
tbl.wt <- getSequence(mm, tbl.regions.noSeq)
printf("findMatchesByChromosomalRegion, size: %d", 1 + end - start);
x.wt <- findMatchesByChromosomalRegion(mm, tbl.regions.noSeq, pwmMatchMinimumAsPercentage=75)
#colnames(tbl.regions) <- c("chrom", "start", "endpos",
x.fp <- findMatchesByChromosomalRegion(mm, tbl.regions, pwmMatchMinimumAsPercentage=75)
} # runWithAugmentedFootprints
#----------------------------------------------------------------------------------------------------
tead1.motifs <- unique(subset(tbl.mg, tf.gene=="TEAD1")$motif)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.