library(TrenaMultiScore)
library(TrenaProjectErythropoiesis)
library(TrenaProjectAD)
library(RUnit)
library(factoextra)
library(ghdb)
library(RPostgreSQL)
# library(Rtsne)
#------------------------------------------------------------------------------------------------------------------------
if(!exists("tmse")) {
message(sprintf("--- creating instance of TrenaMultiScore"))
tpe <- TrenaProjectErythropoiesis()
tpad <- TrenaProjectAD()
tmse <- TrenaMultiScore(tpe, "GATA2");
tmsa <- TrenaMultiScore(tpad, "TREM2")
}
#------------------------------------------------------------------------------------------------------------------------
runTests <- function()
{
test_constructor()
test_getTargetGeneInfo()
test_getGeneHancerRegion()
test_findOpenChromatin()
# test_findFimoTFBS() the now standard approach is to run BigFimo separately, beforehand
# test_findMoodsTFBS() FIMO is adequate, moods' contribution uncertain
test_scoreMotifHitsForConservation()
test_addDistanceToTSS()
test_scoreMotifHitsForGeneHancer()
test_addGenicAnnotations()
test_addGeneExpressionCorrelations()
test_addChIP()
test_addRnaBindingProteins()
test_add.eqtls.atTagSNP()
test_add.eqtls.big.NDUFS2()
} # runTests
#------------------------------------------------------------------------------------------------------------------------
test_constructor <- function()
{
message(sprintf("--- test_constructor"))
checkTrue("TrenaMultiScore" %in% is(tmsa))
checkTrue("TrenaProjectAD" %in% is (getProject(tmsa)))
checkTrue("TrenaMultiScore" %in% is(tmse))
checkTrue("TrenaProjectErythropoiesis" %in% is (getProject(tmse)))
} # test_constructor
#------------------------------------------------------------------------------------------------------------------------
test_getGeneHancerRegion <- function()
{
message(sprintf("--- test_getGeneHancerRegion"))
tmse <- TrenaMultiScore(tpe, "GATA2");
tbl.gata2 <- getGeneHancerRegion(tmse)
checkTrue(is.data.frame(tbl.gata2))
checkEquals(as.list(tbl.gata2), list(chrom="chr3", start=128073944, end=128683249, width=609306))
tbl.trem2 <- getGeneHancerRegion(tmsa)
checkTrue(is.data.frame(tbl.trem2))
checkEquals(as.list(tbl.trem2), list(chrom="chr6", start=41110400, end=41296400, width=186001))
tmse.fail <- TrenaMultiScore(tpe, "C2orf40");
tbl.fail <- getGeneHancerRegion(tmse.fail)
} # test_getGeneHancerRegion
#------------------------------------------------------------------------------------------------------------------------
test_lincRNA <- function()
{
name.1 <- "RP11-332H14.2"
name.2 <- "C2orf49-DT"
tpad <- TrenaProjectAD()
tms <- TrenaMultiScore(tpad, name.2)
tbl.gh <- getGeneHancerRegion(tms)
checkEquals(dim(tbl.gh), c(1, 4))
checkEquals(as.list(tbl.gh),
list(chrom="chr2", start=104598440, end=106033056,width=1434617))
} # test_lincRNA
#------------------------------------------------------------------------------------------------------------------------
test_findOpenChromatin <- function()
{
message(sprintf("--- test_findOpenChromatin"))
#--------------------------------------------------------------
# use a 1300 base region of chr19, upstream but not overlapping
# with the promoter of DOT1L
#--------------------------------------------------------------
tpe.dot1l <- TrenaProjectErythropoiesis()
tms <- TrenaMultiScore(tpe.dot1l, "DOT1L");
findOpenChromatin(tms, "chr19", 2158968, 2160275,
intersect.with.geneHancer=FALSE,
use.merged.atac=TRUE)
tbl.oc <- getOpenChromatin(tms)
checkEquals(dim(tbl.oc), c(2, 5))
#--------------------------------------------------------------
# now ask for only the oc/atac overlapping genehancer promoter
# should get nothing back
#-------------------------------------------------------------
findOpenChromatin(tms, "chr19", 2158968, 2160275,
intersect.with.geneHancer=TRUE,
use.merged.atac=TRUE)
tbl.oc <- getOpenChromatin(tms)
checkEquals(nrow(tbl.oc), 0)
#--------------------------------------------------------------------------
# repeat with 6kb region with a merged atac overlapping genehancer promoter
# no overlap obliged on first call
#------------------------------------------------------------------------
findOpenChromatin(tms, "chr19", start=2158488, end=2164650,
intersect.with.geneHancer=FALSE,
use.merged.atac=TRUE)
tbl.oc <- getOpenChromatin(tms)
checkEquals(nrow(tbl.oc), 3)
#--------------------------------------------------------------------------
# repeat with 6kb region with a merged atac overlapping genehancer promoter
# overlap obliged on second call
#------------------------------------------------------------------------
findOpenChromatin(tms, "chr19", start=2158488, end=2164650,
intersect.with.geneHancer=TRUE,
use.merged.atac=TRUE)
tbl.oc <- getOpenChromatin(tms)
checkEquals(nrow(tbl.oc), 1)
} # test_findOpenChromatin
#------------------------------------------------------------------------------------------------------------------------
slow_test_findFimoTFBS <- function()
{
message(sprintf("--- slow_test_findFimoTFBS"))
tmse <- TrenaMultiScore(tpe, "GATA2");
findOpenChromatin(tmse, "chr3", start=128483204, end=128483276)
getOpenChromatin(tmse)
findFimoTFBS(tmse)
tbl.fimo <- getMultiScoreTable(tmse)
checkEquals(ncol(tbl.fimo), 9)
checkTrue(nrow(tbl.fimo) >= 2)
checkTrue(all(c("SP2", "ZNF263") %in% tbl.fimo$tf))
findFimoTFBS(tmse, fimo.threshold=1e-3)
tbl.fimo <- getMultiScoreTable(tmse)
checkTrue(nrow(tbl.fimo) > 30)
checkTrue(all(c("SP2", "ZNF263", "CEBPB", "SP1") %in% tbl.fimo$tf)) # and many others
} # slow_test_findFimoTFBS
#------------------------------------------------------------------------------------------------------------------------
disabled_test_findMoodsTFBS <- function()
{
message(sprintf("--- test_findMoodsTFBS"))
tmse <- TrenaMultiScore(tpe, "GATA2");
findOpenChromatin(tmse, "chr3", start=128483204, end=128483276)
getOpenChromatin(tmse)
t4 <- system.time(findMoodsTFBS(tmse, moods.threshold=1e-4))[["elapsed"]]
tbl.moods.4 <- tmse@state$moods
printf("moods 1e-4: %6.1f secs, %d hits", t4, nrow(tbl.moods.4))
t3 <- system.time(findMoodsTFBS(tmse, moods.threshold=1e-3))[["elapsed"]]
tbl.moods.3 <- tmse@state$moods
printf("moods 1e-3: %6.1f secs, %d hits", t3, nrow(tbl.moods.3))
t2 <- system.time(findMoodsTFBS(tmse, moods.threshold=1e-2))[["elapsed"]]
tbl.moods.2 <- tmse@state$moods
printf("moods 1e-2: %6.1f secs, %d hits", t2, nrow(tbl.moods.2))
t6 <- system.time(findMoodsTFBS(tmse, moods.threshold=1e-6))[["elapsed"]]
tbl.moods.6 <- tmse@state$moods
printf("moods 1e-6: %6.1f secs, %d hits", t6, nrow(tbl.moods.6))
checkEquals(nrow(tbl.moods.6), 0)
} # disabled_test_findMoodsTFBS
#------------------------------------------------------------------------------------------------------------------------
test_scoreMotifHitsForConservation <- function()
{
message(sprintf("--- test_scoreMotifHitsForConservation"))
tmse <- TrenaMultiScore(tpe, "GATA2");
findOpenChromatin(tmse, "chr3", start=128481000, end=128489000)
findFimoTFBS(tmse, fimo.threshold=1e-5)
tbl.fimo <- getMultiScoreTable(tmse)
checkEquals(ncol(tbl.fimo), 9)
checkTrue(nrow(tbl.fimo) >= 5)
scoreMotifHitsForConservation(tmse)
tbl <- getMultiScoreTable(tmse)
checkEquals(ncol(tbl), 11)
checkTrue(nrow(tbl) >= 5)
#checkTrue(all(c("phast7", "phast30", "phast100") %in% colnames(tbl)))
checkTrue(all(c("phast7", "phast100") %in% colnames(tbl)))
checkEqualsNumeric(mean(tbl$phast7), 0.6, tolerance=0.05)
#checkEqualsNumeric(mean(tbl$phast30), 0.66, tolerance=0.05)
checkEqualsNumeric(mean(tbl$phast100), 0.77, tolerance=0.05)
} # test_scoreMotifHitsForConservation
#------------------------------------------------------------------------------------------------------------------------
test_getTargetGeneInfo <- function()
{
message(sprintf("--- test_getTargetGeneInfo"))
tmse <- TrenaMultiScore(tpe, "GATA2");
x <- getTargetGeneInfo(tmse)
checkTrue(all(c("tss", "strand", "chrom", "start", "end") %in% names(x)))
checkEquals(x$tss, 128493185)
checkEquals(x$strand, -1)
} # test_getTargetGeneInfo
#------------------------------------------------------------------------------------------------------------------------
test_addDistanceToTSS <- function()
{
message(sprintf("--- test_addDistanceToTSS"))
tmse <- TrenaMultiScore(tpe, "GATA2");
tss <- getTargetGeneInfo(tmse)$tss
shoulder <- 10000
findOpenChromatin(tmse, "chr3", start=tss-(2*shoulder), end=tss+shoulder)
findFimoTFBS(tmse, fimo.threshold=1e-5)
tbl.fimo <- getMultiScoreTable(tmse)
checkEquals(ncol(tbl.fimo), 9)
checkTrue(nrow(tbl.fimo) >= 15)
scoreMotifHitsForConservation(tmse)
addDistanceToTSS(tmse)
tbl <- getMultiScoreTable(tmse)
# (31 jan 2020: with R-devel 4.0, we have an off-by-one difference
# when compared to R 3.6. not sure why. skate around this for now.
checkEqualsNumeric(head(sort(tbl$tss)),
c(-4770, -4769, -4769, -4768, -4768, -4768),
tolerance=1)
} # test_addDistanceToTSS
#------------------------------------------------------------------------------------------------------------------------
test_scoreMotifHitsForGeneHancer <- function()
{
message(sprintf("--- test_scoreMotifHitsForGeneHancer"))
tss <- getTargetGeneInfo(tmse)$tss
shoulder <- 100000
findOpenChromatin(tmse, "chr3", start=tss-(2*shoulder), end=tss+shoulder)
findFimoTFBS(tmse, fimo.threshold=1e-6)
tbl.fimo <- getMultiScoreTable(tmse)
# checkEquals(dim(tbl.fimo), c(5, 9))
scoreMotifHitsForGeneHancer(tmse)
tbl <- getMultiScoreTable(tmse)
gh.values <- unique(tbl$gh)
# when a motif is within the full range reported by genehancer, but does
# not fall within a genehancer region, then it gets a zero score. check that
# also check to see that multiple non-zero scores were picked up
checkTrue(0 %in% gh.values)
checkTrue(length(unique(gh.values)) >= 3)
} # test_scoreMotifHitsForGeneHancer
#------------------------------------------------------------------------------------------------------------------------
test_addGenicAnnotations <- function()
{
message(sprintf("--- test_addGenicAnnotations"))
findOpenChromatin(tmse, "chr3", start=128481000, end=128489000)
getOpenChromatin(tmse)
findFimoTFBS(tmse, fimo.threshold=1e-5)
addDistanceToTSS(tmse)
scoreMotifHitsForConservation(tmse)
addGenicAnnotations(tmse)
tbl.tms <- getMultiScoreTable(tmse)
dim(tbl.tms)
checkEquals(ncol(tbl.tms), 14)
checkTrue(all(c("annot.type", "annot.symbol") %in% colnames(tbl.tms)))
checkTrue(nrow(tbl.tms) >= 5)
} # test_addGenicAnnotations
#------------------------------------------------------------------------------------------------------------------------
test_addChIP <- function()
{
message(sprintf("--- test_addChip"))
findOpenChromatin(tmse, "chr3", start=128481000, end=128489000)
tbl.oc <- getOpenChromatin(tmse)
checkEquals(nrow(tbl.oc), 8)
findFimoTFBS(tmse, fimo.threshold=1e-5)
tbl.tms <- getMultiScoreTable(tmse)
checkEquals(ncol(tbl.tms), 9)
checkTrue(nrow(tbl.tms) >= 5)
addChIP(tmse)
tbl.tms <- getMultiScoreTable(tmse)
checkEquals(ncol(tbl.tms), 10)
checkTrue(nrow(tbl.tms) >= 5)
checkTrue(nrow(subset(tbl.tms, chip)) >= 3)
} # test_addChIP
#------------------------------------------------------------------------------------------------------------------------
test_addGeneExpressionCorrelations <- function()
{
message(sprintf("--- test_addGeneExpressionCorrelations"))
f <- system.file(package="TrenaMultiScore", "extdata", "tbl.fimo.hoxb4-1e5.RData")
checkTrue(file.exists(f))
tbl.fimo.boxb4 <- get(load(f))
checkEquals(length(sort(unique(tbl.fimo.hoxb4$tf))), 106)
tms.hoxb4 <- TrenaMultiScore(tpe, "HOXB4", tbl.fimo=tbl.fimo.hoxb4);
mtx <- getExpressionMatrix(tpe, "brandLabDifferentiationTimeCourse-27171x28")
#----------------------------------------------------
# first, correlate all columns using default pearson
#----------------------------------------------------
addGeneExpressionCorrelations(tms.hoxb4, mtx)
tbl.tms <- getMultiScoreTable(tms.hoxb4)
set.seed(17)
random.tfs <- sample(sort(unique(tbl.tms$tf)), size=3)
random.tfs <- intersect(random.tfs, rownames(mtx))
for(this.tf in random.tfs){
expected <- cor(mtx["HOXB4", ],
mtx[this.tf,], method="pearson", use="pairwise.complete.obs")
actual <- subset(tbl.tms, tf==this.tf)$cor
checkTrue(all(actual == actual[1])) # all should be the same, calculated only once
# correlation is rounded to two decimal digits
#printf("%s expected: %f actual: %f", this.tf, expected, actual[1])
checkEqualsNumeric(expected, actual[1], tolerance=0.1)
} # for this.tf
#----------------------------------------------------
# now, correlate all columns using spearman
#----------------------------------------------------
addGeneExpressionCorrelations(tms.hoxb4, mtx, method="spearman")
tbl.tms <- getMultiScoreTable(tms.hoxb4)
for(this.tf in random.tfs){
expected <- cor(mtx["HOXB4", ],
mtx[this.tf,], method="spearman", use="pairwise.complete.obs")
actual <- subset(tbl.tms, tf==this.tf)$cor
checkTrue(all(actual == actual[1])) # all should be the same, calculated only once
# correlation is rounded to two decimal digits
#printf("%s expected: %f actual: %f", this.tf, expected, actual[1])
checkEqualsNumeric(expected, actual[1], tolerance=0.1)
} # for this.tf
timepoints <- c("day0.r1", "day0.r2", "day2.r1", "day2.r2", "day4.r1", "day4.r2")
addGeneExpressionCorrelations(tms.hoxb4, mtx, "cor.early", timepoints, method="spearman")
tbl.tms <- getMultiScoreTable(tms.hoxb4)
checkTrue(all(c("cor", "cor.early") %in% colnames(tbl.tms)))
# choose 10 tfs at random for checking
set.seed(17)
tfs <- sample(unique(tbl.tms$tf), 10)
tfs.usable <- intersect(tfs, rownames(mtx))
tfs.noExpression <- setdiff(tfs, rownames(mtx))
for(this.tf in tfs.usable){
expected <- cor(mtx[this.tf, ], mtx["HOXB4",], method="spearman")
actual <- subset(tbl.tms, tf==this.tf)$cor[1]
checkEqualsNumeric(expected, actual, tol=0.1)
expected.sub <- cor(mtx[this.tf, timepoints],
mtx["HOXB4", timepoints],
method="spearman")
actual <- subset(tbl.tms, tf==this.tf)$cor.early[1]
checkEqualsNumeric(expected.sub, actual, tol=0.1)
}
checkTrue(all(is.na(subset(tbl.tms, tf %in% tfs.noExpression)$cor)))
checkTrue(all(is.na(subset(tbl.tms, tf %in% tfs.noExpression)$cor.early)))
} # test_addGeneExpressionCorrelations
#------------------------------------------------------------------------------------------------------------------------
test_addRnaBindingProteins <- function()
{
message(sprintf("--- test_addRnaBindingProteins"))
tms.gata2 <- TrenaMultiScore(TrenaProjectErythropoiesis(), "GATA2", quiet=TRUE);
checkTrue(is.null(getRnaBindingProteins(tms.gata2)))
tbl.rbp <- addRnaBindingProteins(tms.gata2)
checkEquals(dim(tbl.rbp), c(2390, 12))
# now retrieve it again, without a fresh query
tbl.v2 <- getRnaBindingProteins(tms.gata2)
checkEquals(dim(tbl.rbp), c(2390, 12))
} # test_addRnaBindingProteins
#------------------------------------------------------------------------------------------------------------------------
slow_test_erythropoeisis.hoxb4 <- function()
{
message(sprintf("--- slow_test_erythropoeisis.hoxb4"))
tms.hoxb4 <- TrenaMultiScore(tpe, "HOXB4");
getGeneHancerRegion(tms.hoxb4)
findOpenChromatin(tms.hoxb4)
findFimoTFBS(tms.hoxb4, fimo.threshold=1e-3)
scoreMotifHitsForConservation(tms.hoxb4)
scoreMotifHitsForGeneHancer(tms.hoxb4)
addDistanceToTSS(tms.hoxb4)
mtx <- getExpressionMatrix(tpe, "brandLabDifferentiationTimeCourse-27171x28")
addGeneExpressionCorrelations(tms.hoxb4, mtx)
addGenicAnnotations(tms.hoxb4)
tbl <- getMultiScoreTable(tms.hoxb4)
tbl.sub.neg <- subset(tbl, fimo_pvalue < 0.0001 & phast100 > 0.8 & cor < -0.4 & gh > 0)
checkTrue(all(c("RXRG", "MXI1", "VSX2","IRF1") %in% tbl.sub.neg$tf))
tbl.sub.pos <-
subset(tbl, fimo_pvalue < 0.00001 & phast100 > 0.8 & cor > 0.8 & gh > 0)
checkTrue(all(c("MYC", "ZNF263") %in% tbl.sub.pos$tf))
} # slow_test_erythropoeisis.hoxb4
#------------------------------------------------------------------------------------------------------------------------
# both haney and krumsiek report that GATA1 regulates TAL1 (aka SCL)
# can we see that? brandLabDifferentiationTimeCourse-27171x28 shows 0.75 correlation
slow_test_erythropoeisis.tal1 <- function()
{
message(sprintf("--- slow_test_erythropoeisis.tal1"))
tms.tal1 <- TrenaMultiScore(tpe, "TAL1");
getGeneHancerRegion(tms.tal1) # 190kb
findOpenChromatin(tms.tal1)
findFimoTFBS(tms.tal1, fimo.threshold=1e-3)
scoreMotifHitsForConservation(tms.tal1)
scoreMotifHitsForGeneHancer(tms.tal1)
addDistanceToTSS(tms.tal1)
mtx <- getExpressionMatrix(tpe, "brandLabDifferentiationTimeCourse-27171x28")
addGeneExpressionCorrelations(tms.tal1, mtx)
addGenicAnnotations(tms.tal1)
addChIP(tms.tal1)
tbl <- getMultiScoreTable(tms.tal1)
tbl$cor[which(is.na(tbl$cor))] <- 0
tbl$motifScore <- round(-log10(tbl$p.value), 2)
dim(tbl)
cor.spread <- fivenum(abs(tbl$cor))
cor.threshold <- cor.spread[4]
cor.threshold <- 0.9 * cor.spread[5]
tbl <- subset(tbl, abs(cor) >= cor.threshold)
dim(tbl)
checkTrue(nrow(tbl) > 80) # 90 x 18 (26 aug 2020)
tfoi <- unique(tbl$tf)
length(tfoi)
tblc <- data.frame(row.names=tfoi)
tblc$cor <- as.numeric(lapply(tfoi, function(TF) mean(abs(subset(tbl, tf==TF)$cor))))
tblc$gh <- as.numeric(lapply(tfoi, function(TF) mean(subset(tbl, tf==TF)$gh)))
tblc$count <- as.numeric(lapply(tfoi, function(TF) nrow(subset(tbl, tf==TF))))
tblc$chip <- as.numeric(lapply(tfoi, function(TF) sum(subset(tbl, tf==TF)$chip)))
tblc$fimo <- as.numeric(lapply(tfoi, function(TF) mean(subset(tbl, tf==TF)$motifScore)))
tblc$phast <- as.numeric(lapply(tfoi, function(TF) mean(as.numeric(as.matrix(subset(tbl, tf==TF)[, c("phast7", "phast100")])))))
#tblc$phast <- as.numeric(lapply(tfoi, function(TF) mean(as.numeric(as.matrix(subset(tbl, tf==TF)[, c("phast7", "phast30", "phast100")])))))
#tblc$tss <- as.numeric(lapply(tfoi, function(TF) mean(abs(subset(tbl, tf==TF)$tss))))
mtx <- as.matrix(tblc)
dim(mtx) # 5 x 6
pca <- prcomp(mtx, scale=TRUE)
fviz_eig(pca)
#--------------------------------------------------
# explore similarities among the tfs selected above
# 5 which have the best mRNA correlation to TAL1
#--------------------------------------------------
fviz_pca_ind(pca,
col.ind = "cos2", # Color by the quality of representation
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)
fviz_pca_var(pca,
col.var = "contrib", # Color by contributions to the PC
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)
fviz_pca_biplot(pca, repel = TRUE,
col.var = "#2E9FDF", # Variables color
col.ind = "#696969" # Individuals color
)
} # slow_test_erythropoeisis.tal1
#------------------------------------------------------------------------------------------------------------------------
slow_test_erythropoeisis.gata2 <- function()
{
message(sprintf("--- slow_test_erythropoeisis.gata2"))
tms.gata2 <- TrenaMultiScore(tpe, "GATA2");
region <- getGeneHancerRegion(tms.gata2)
with(region, 1 + end - start) # 548kb
findOpenChromatin(tms.gata2)
motifs <- query(MotifDb, "sapiens", c("jaspar2018", "hocomocov11-core"))
findFimoTFBS(tms.gata2, motifs=motifs, fimo.threshold=1e-3)
scoreMotifHitsForConservation(tms.gata2)
scoreMotifHitsForGeneHancer(tms.gata2)
addDistanceToTSS(tms.gata2)
mtx <- getExpressionMatrix(tpe, "brandLabDifferentiationTimeCourse-27171x28")
addGeneExpressionCorrelations(tms.gata2, mtx)
addGenicAnnotations(tms.gata2)
addChIP(tms.gata2)
tbl <- getMultiScoreTable(tms.gata2)
table(tbl$chip)
tbl$cor[which(is.na(tbl$cor))] <- 0
tbl$motifScore <- round(-log10(tbl$p.value), 2)
dim(tbl) # 416 x 18
cor.spread <- fivenum(abs(tbl$cor))
cor.threshold <- cor.spread[3]
#cor.threshold <- 0.7 * cor.spread[5]
tbl <- subset(tbl, abs(cor) >= cor.threshold)
dim(tbl) # 223 x 18
tfoi <- unique(tbl$tf)
length(tfoi) # 60
tblc <- data.frame(row.names=tfoi)
#tblc$absCor <- as.numeric(lapply(tfoi, function(TF) mean(abs(subset(tbl, tf==TF)$cor))))
#tblc$cor <- as.numeric(lapply(tfoi, function(TF) mean(subset(tbl, tf==TF)$cor)))
tblc$gh <- round(as.numeric(lapply(tfoi, function(TF) mean(subset(tbl, tf==TF)$gh))),0)
tblc$count <- as.numeric(lapply(tfoi, function(TF) nrow(subset(tbl, tf==TF))))
tblc$chip <- round(as.numeric(lapply(tfoi, function(TF) mean(subset(tbl, tf==TF)$chip))),0)
tblc$fimo <- round(as.numeric(lapply(tfoi, function(TF) mean(subset(tbl, tf==TF)$motifScore))),2)
tblc$phast <- round(as.numeric(lapply(tfoi,
function(TF) mean(as.numeric(as.matrix(subset(tbl, tf==TF)[, c("phast7", "phast100")]))))),2)
#tblc$phast <- round(as.numeric(lapply(tfoi,
# function(TF) mean(as.numeric(as.matrix(subset(tbl, tf==TF)[, c("phast7", "phast30", "phast100")]))))),2)
#tblc$tss <- as.numeric(lapply(tfoi, function(TF) mean(abs(subset(tbl, tf==TF)$tss))))
mtx <- as.matrix(tblc)
pca <- prcomp(mtx, scale=TRUE)
library(factoextra)
fviz_eig(pca)
fviz_pca_ind(pca,
col.ind = "cos2", # Color by the quality of representation
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = FALSE # Avoid text overlapping
)
fviz_pca_var(pca,
col.var = "contrib", # Color by contributions to the PC
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)
fviz_pca_biplot(pca, repel = TRUE,
col.var = "red", # #2E9FDF", # Variables color
col.ind = "#696969" # Individuals color
)
} # slow_test_erythropoeisis.gata2
#------------------------------------------------------------------------------------------------------------------------
# takes a long time to run with fimo.threshold of 1e-3. skip unless you are patient.
slow_test_AD.trem2 <- function()
{
message(sprintf("--- slow_test_AD.trems"))
tms.trem2 <- TrenaMultiScore(tpad, "TREM2");
getGeneHancerRegion(tms.trem2)
findOpenChromatin(tms.trem2)
findFimoTFBS(tms.trem2, fimo.threshold=1e-3) # 146k hits!
addChIP(tms.trem2)
scoreMotifHitsForConservation(tms.trem2)
scoreMotifHitsForGeneHancer(tms.trem2)
addGenicAnnotations(tms.trem2)
addDistanceToTSS(tms.trem2)
mtx <- get(load("~/github/TrenaProjectAD/inst/extdata/expression/mayo.tcx.16969x262.covariateCorrection.log+scale.RData"))
addGeneExpressionCorrelations(tms.trem2, mtx)
tbl.raw <- getMultiScoreTable(tms.trem2)
dim(tbl.raw)
tbl <- subset(tbl.raw, chip | p.value <= 1e-4)
dim(tbl)
length(unique(tbl$tf))
table(tbl$chip)
tbl$cor[which(is.na(tbl$cor))] <- 0
tbl$motifScore <- round(-log10(tbl$p.value), 2)
dim(tbl)
hist(abs(tbl$cor))
tbl <- subset(tbl, abs(cor) >= 0.1)
dim(tbl)
tfoi <- unique(tbl$tf)
length(tfoi)
tblc <- data.frame(row.names=tfoi)
tblc$cor <- as.numeric(lapply(tfoi, function(TF) mean(abs(subset(tbl, tf==TF)$cor))))
tblc$gh <- as.numeric(lapply(tfoi, function(TF) mean(subset(tbl, tf==TF)$gh)))
tblc$count <- as.numeric(lapply(tfoi, function(TF) nrow(subset(tbl, tf==TF))))
#tblc$chip <- as.numeric(lapply(tfoi, function(TF) subset(tbl, tf==TF)$chip))
tblc$chip <- as.integer(lapply(tfoi, function(TF) sum(subset(tbl, tf==TF)$chip)))
tblc$fimo <- as.numeric(lapply(tfoi, function(TF) mean(subset(tbl, tf==TF)$motifScore)))
tblc$phast <- as.numeric(lapply(tfoi, function(TF) mean(as.numeric(as.matrix(subset(tbl, tf==TF)[, c("phast7", "phast100")])))))
#tblc$phast <- as.numeric(lapply(tfoi, function(TF) mean(as.numeric(as.matrix(subset(tbl, tf==TF)[, c("phast7", "phast30", "phast100")])))))
#tblc$tss <- as.numeric(lapply(tfoi, function(TF) mean(abs(subset(tbl, tf==TF)$tss))))
tblc$count[tblc$count > 10] <- 10
tblc$chip[tblc$chip > 10] <- 10
tblc.trimmed <- subset(tblc, cor > 0.32 | chip > 0 | fimo > 5)
mtx <- as.matrix(tblc.trimmed[, c("chip", "count", "cor")])
pca <- prcomp(mtx, scale=TRUE)
fviz_eig(pca)
fviz_pca_ind(pca,
col.ind = "cos2", # Color by the quality of representation
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)
fviz_pca_var(pca,
col.var = "contrib", # Color by contributions to the PC
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)
fviz_pca_biplot(pca, repel = FALSE,
col.var = "RED", # Variables color
col.ind = "#696969" # Individuals color
)
# m2 <- mtx[rownames(subset(as.data.frame(pca$x), PC1 > 0 & PC2 > 0)),]
# m2[order(m2[,"cor"], decreasing=TRUE),]
# chip count cor
# IKZF1 4 13 0.57
# CEBPA 13 18 0.53
# ESR2 0 23 0.51
# ESR1 21 63 0.49
# SPI1 67 78 0.47
# FLI1 39 51 0.46
# RUNX1 10 14 0.46
# SP2 0 26 0.45
# SP3 0 33 0.43
# ERG 18 23 0.43
# ELF1 18 27 0.41
# ETS1 5 14 0.41
# GABPA 17 22 0.38
# PRDM1 3 20 0.38
# RFX5 4 19 0.37
# SP1 29 129 0.33
# CREB1 5 15 0.32
} # slow_test_AD.trem2
#------------------------------------------------------------------------------------------------------------------------
slow_test_AD.mef2c <- function()
{
message(sprintf("--- slow_test_AD.trems"))
tms.mef2c <- TrenaMultiScore(tpad, "MEF2C");
getGeneHancerRegion(tms.mef2c)
findOpenChromatin(tms.mef2c)
tbl.oc <- getOpenChromatin(tms.mef2c)
printf("base count for fimo: %d", sum(with(tbl.oc, 1 + end-start)))
system.time(findFimoTFBS(tms.mef2c, fimo.threshold=1e-7))
system.time(findFimoTFBS(tms.mef2c, fimo.threshold=1e-6))
system.time(findFimoTFBS(tms.mef2c, fimo.threshold=1e-5))
system.time(findFimoTFBS(tms.mef2c, fimo.threshold=1e-4))
system.time(findFimoTFBS(tms.mef2c, fimo.threshold=1e-3)) # 5 minutes
addChIP(tms.mef2c)
scoreMotifHitsForConservation(tms.mef2c)
scoreMotifHitsForGeneHancer(tms.mef2c)
addGenicAnnotations(tms.mef2c)
addDistanceToTSS(tms.mef2c)
mtx.new <- get(load("~/github/TrenaProjectAD/inst/extdata/expression/mayo.tcx.16969x262.covariateCorrection.log+scale.RData"))
mtx.old <- getExpressionMatrix(tpad, "temporalCortex.15167x264")
addGeneExpressionCorrelations(tms.mef2c, mtx.new)
tbl.raw <- getMultiScoreTable(tms.mef2c)
dim(tbl.raw)
save(tbl.raw, file="mef2c.fimo1e-3.boca.RData")
dim(subset(tbl.raw, chip | p.value <= 1e-5))
tbl <- subset(tbl.raw, chip | p.value <= 1e-5)
dim(tbl)
length(unique(tbl$tf))
table(tbl$chip)
tbl$cor[which(is.na(tbl$cor))] <- 0
tbl$motifScore <- round(-log10(tbl$p.value), 2)
dim(tbl)
hist(abs(tbl$cor))
spread <- fivenum(abs(tbl$cor))
tbl <- subset(tbl, abs(cor) >= spread[4])
dim(tbl)
tfoi <- unique(tbl$tf)
length(tfoi)
tblc <- data.frame(row.names=tfoi)
tblc$cor <- as.numeric(lapply(tfoi, function(TF) mean(abs(subset(tbl, tf==TF)$cor))))
tblc$gh <- as.numeric(lapply(tfoi, function(TF) mean(subset(tbl, tf==TF)$gh)))
tblc$count <- as.numeric(lapply(tfoi, function(TF) nrow(subset(tbl, tf==TF))))
#tblc$chip <- as.numeric(lapply(tfoi, function(TF) subset(tbl, tf==TF)$chip))
tblc$chip <- as.integer(lapply(tfoi, function(TF) sum(subset(tbl, tf==TF)$chip)))
tblc$fimo <- as.numeric(lapply(tfoi, function(TF) mean(subset(tbl, tf==TF)$motifScore)))
tblc$phast <- as.numeric(lapply(tfoi, function(TF) mean(as.numeric(as.matrix(subset(tbl, tf==TF)[, c("phast7", "phast100")])))))
#tblc$phast <- as.numeric(lapply(tfoi, function(TF) mean(as.numeric(as.matrix(subset(tbl, tf==TF)[, c("phast7", "phast30", "phast100")])))))
#tblc$tss <- as.numeric(lapply(tfoi, function(TF) mean(abs(subset(tbl, tf==TF)$tss))))
remove.mef2c <- grep("MEF2C", rownames(tblc))
if(length(remove.mef2c) > 0)
tblc <- tblc[-remove.mef2c,]
dim(tblc)
#tblc$count[tblc$count > 10] <- 10
#tblc$chip[tblc$chip > 10] <- 10
fivenum(tblc$cor)
tblc.trimmed <- subset(tblc, gh > -1 & (cor > 0.22 | chip > 0) & fimo > 3)
dim(tblc.trimmed)
#tblc.trimmed <- subset(tblc, gh > 0 & (cor > 0.22 | chip > 0 | fimo > 5))
new.order <- with(tblc.trimmed, order(cor, count, fimo, chip, decreasing=TRUE))
tblc.trimmed <- tblc.trimmed[new.order,]
dim(tblc.trimmed)
mtx.pca <- as.matrix(tblc.trimmed[, c("chip", "count", "cor", "gh")])
pca <- prcomp(mtx.pca[1:30,], scale=TRUE)
fviz_eig(pca)
fviz_pca_ind(pca,
col.ind = "cos2", # Color by the quality of representation
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)
fviz_pca_var(pca,
col.var = "contrib", # Color by contributions to the PC
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)
fviz_pca_biplot(pca, repel = FALSE,
col.var = "RED", # Variables color
col.ind = "#696969" # Individuals color
)
save(tbl, file="mef2c.feature.table.RData")
} # slow_test_AD.mef2c
#------------------------------------------------------------------------------------------------------------------------
# chrom start end tf strand score p.value matched_sequence chip phast7 phast30 phast100 gh annot.type annot.symbol tss cor motif_id motifScore
# 24798 chr5 88667670 88667684 ZBTB33 - 10.53930 5.99e-05 CTCCCGCGGTCTCGA FALSE 1.00 1.00 1.00 7.39 introns,promoters LINC00461,MIR9-2 236587 0.54 Hsapiens-jaspar2018-ZBTB33-MA0527.1 4.22
# 39561 chr5 88680491 88680505 ZBTB33 + 11.01120 4.76e-05 CTCTCTCGGGAGCTC FALSE 0.00 0.02 0.00 0.00 introns,1to5kb LINC00461,MEF2C-AS2 223766 0.54 Hsapiens-jaspar2018-ZBTB33-MA0527.1 4.32
# 39935 chr5 88680553 88680567 ZBTB33 - 9.83146 8.39e-05 GCCCCGCGCGGGCCC FALSE 0.37 0.35 0.55 0.00 introns,1to5kb LINC00461,MEF2C-AS2 223704 0.54 Hsapiens-jaspar2018-ZBTB33-MA0527.1 4.08
# 40524 chr5 88680674 88680688 ZBTB33 - 10.65170 5.68e-05 TTCCCGCGGCCTCCC FALSE 0.00 0.00 0.00 0.00 introns,1to5kb LINC00461,MEF2C-AS2 223583 0.54 Hsapiens-jaspar2018-ZBTB33-MA0527.1 4.25
# 47241 chr5 88685169 88685183 ZBTB33 - 9.76404 8.65e-05 GTCCCGCGGCGCTAG FALSE 0.78 0.78 0.36 0.00 introns,promoters LINC00461,MEF2C-AS2 219088 0.54 Hsapiens-jaspar2018-ZBTB33-MA0527.1 4.06
# 51217 chr5 88691107 88691121 ZBTB33 - 9.55056 9.53e-05 GGCTCGCGGTGCTAC FALSE 0.26 0.32 0.26 0.00 introns,promoters MEF2C-AS2,LINC00461 213150 0.54 Hsapiens-jaspar2018-ZBTB33-MA0527.1 4.02
# 91511 chr5 88872971 88872985 ZBTB33 + 9.56180 9.48e-05 CTCTCTCGAGAGTGT FALSE 0.05 0.00 0.00 0.00 introns MEF2C 31286 0.54 Hsapiens-jaspar2018-ZBTB33-MA0527.1 4.02
# 94139 chr5 88883227 88883241 ZBTB33 - 12.11240 2.69e-05 CCCTCGCGCGCGCGC FALSE 0.05 0.55 0.60 604.42 introns,promoters,5UTRs,exons MEF2C,MEF2C-AS1 21030 0.54 Hsapiens-jaspar2018-ZBTB33-MA0527.1 4.57
# 94263 chr5 88883240 88883254 ZBTB33 - 9.48315 9.83e-05 CTCGCGCGCGCTCCC FALSE 0.13 0.09 0.55 604.42 introns,promoters,5UTRs,exons MEF2C,MEF2C-AS1 21017 0.54 Hsapiens-jaspar2018-ZBTB33-MA0527.1 4.01
# 94289 chr5 88883242 88883256 ZBTB33 - 13.59550 1.15e-05 CCCTCGCGCGCGCTC FALSE 0.12 0.08 0.59 604.42 introns,promoters,5UTRs,exons MEF2C,MEF2C-AS1 21015 0.54 Hsapiens-jaspar2018-ZBTB33-MA0527.1 4.94
# 94684 chr5 88883256 88883270 ZBTB33 - 7.04494 2.74e-04 GCCCCGCGCCCCCCC TRUE 0.07 0.46 0.51 604.42 introns,promoters,5UTRs,exons MEF2C,MEF2C-AS1 21001 0.54 Hsapiens-jaspar2018-ZBTB33-MA0527.1 3.56
# 94846 chr5 88883274 88883288 ZBTB33 - 4.28090 7.54e-04 GGCGCGCGCGAATGC TRUE 0.00 0.84 0.46 604.42 introns,promoters,5UTRs,exons MEF2C,MEF2C-AS1 20983 0.54 Hsapiens-jaspar2018-ZBTB33-MA0527.1 3.12
# 94865 chr5 88883277 88883291 ZBTB33 + 5.25843 5.36e-04 TTCGCGCGCGCCGAG TRUE 0.00 0.87 0.40 604.42 introns,promoters,5UTRs,exons MEF2C,MEF2C-AS1 20980 0.54 Hsapiens-jaspar2018-ZBTB33-MA0527.1 3.27
# 95861 chr5 88883476 88883490 ZBTB33 - 4.95506 5.97e-04 TTTACGCGATGTTTC TRUE 1.00 1.00 1.00 604.42 introns,promoters,exons MEF2C,MEF2C-AS1 20781 0.54 Hsapiens-jaspar2018-ZBTB33-MA0527.1 3.22
# 97116 chr5 88883656 88883670 ZBTB33 + 4.57303 6.82e-04 GCTTCGCGCTGCCTT TRUE 0.33 0.95 0.00 604.42 introns,promoters,5UTRs,exons MEF2C,MEF2C-AS1 20601 0.54 Hsapiens-jaspar2018-ZBTB33-MA0527.1 3.17
# 97343 chr5 88883715 88883729 ZBTB33 - 6.08989 3.95e-04 AGCTCGCGAGGAAAG TRUE 0.03 0.32 0.62 604.42 introns,promoters,5UTRs,exons MEF2C,MEF2C-AS1 20542 0.54 Hsapiens-jaspar2018-ZBTB33-MA0527.1 3.40
# 97347 chr5 88883718 88883732 ZBTB33 + 9.88764 8.17e-05 TCCTCGCGAGCTGGA TRUE 0.05 0.50 0.63 604.42 introns,promoters,5UTRs,exons MEF2C,MEF2C-AS1 20539 0.54 Hsapiens-jaspar2018-ZBTB33-MA0527.1 4.09
# 98820 chr5 88884089 88884103 ZBTB33 + 15.42700 3.44e-06 CTCTCGCGGCGCCTC FALSE 0.00 0.00 0.00 604.42 introns,promoters,1to5kb MEF2C,MEF2C-AS1 20168 0.54 Hsapiens-jaspar2018-ZBTB33-MA0527.1 5.46
#
# mtx["ZBTB33",]
# chip count cor gh
# 7.0000 18.0000 0.5400 369.7783
#
test_mef2cModel <- function()
{
goi <- "ZBTB33"
igv <- start.igv("MEF2C")
tbl.gh <- getEnhancers(tpad)
tbl.features <- get(load("mef2c.feature.table.RData"))
with(tbl.gh, showGenomicRegion(igv, sprintf("%s:%d-%d", chrom[1], min(start)-1000, max(end)+ 1000)))
track <- DataFrameQuantitativeTrack("gh", tbl.gh[, c("chrom", "start", "end", "combinedscore")], color="brown",
autoscale=FALSE, min=0, max=20)
displayTrack(igv, track)
tbl.goifimo <- subset(tbl.features, tf==goi)[, c("chrom", "start", "end")]
track <- DataFrameAnnotationTrack("ZBTB33.fimo", tbl.goifimo, color="darkBlue", trackHeight=25)
displayTrack(igv, track)
tbl.chip <- with(tbl.gh, getChipSeq(tpad, chrom[1], min(start)-1000, max(end)+1000, tfs=goi))
track <- DataFrameAnnotationTrack("ChIP", tbl.chip[, c("chrom", "start", "end")], color="darkGreen")
displayTrack(igv, track)
} # test_mef2cModel
#------------------------------------------------------------------------------------------------------------------------
test_add.eqtls.atTagSNP <- function()
{
message(sprintf("--- test_add.eqtls.atTagSNP"))
targetGene <- "NDUFS2"
tpad <- TrenaProjectAD()
f <- system.file(package="TrenaMultiScore", "extdata", "tbl.fimo.1e3.512bp.ndufs2.RData")
checkTrue(file.exists(f))
tbl.fimo <- get(load(f))
#--------------------------------------------------------------
# a very simple case to start, related to AD, just the tag.snp
# 33 base pair span, continaing 32 motifs hits
#--------------------------------------------------------------
tbl.fimo.small <- subset(tbl.fimo, start > 161185590 & end < 161185625)
dim(tbl.fimo.small)
fimo.min <- min(tbl.fimo.small$start)
fimo.max <- max(tbl.fimo.small$end)
fimo.chrom <- tbl.fimo.small$chrom[1]
printf("--- fimo span: %d bases", (fimo.max - fimo.min))
f <- system.file(package="TrenaProjectAD", "extdata", "genomicRegions", "boca-hg38-consensus-ATAC.RData")
checkTrue(file.exists(f))
tbl.oc <- get(load(f))
f <- system.file(package="TrenaMultiScore", "extdata", "tbl.eqtl.ndufs2.small.RData")
checkTrue(file.exists(f))
tbl.eqtl <- get(load(f))
tms <- TrenaMultiScore(tpad, targetGene, tbl.fimo.small, tbl.oc, quiet=FALSE)
#--------------------------------------------------------------------------------
# filter the eqtls, producing 10 reports of the tag.snp, different pvalue & beta
#--------------------------------------------------------------------------------
tbl.eqtl.01 <- subset(tbl.eqtl, hg38 >= fimo.min &
hg38 <= fimo.max &
gene=="NDUFS2" &
grepl("Brain", study, ignore.case=TRUE) &
pvalue < 0.01)
dim(tbl.eqtl.01)
new.order <- order(abs(tbl.eqtl.01$beta), decreasing=TRUE)
tbl.eqtl.01 <- tbl.eqtl.01[new.order,]
dim(tbl.eqtl.01) # 10 9
#----------------------------------------------------
# further filtering will be done in this method call
#----------------------------------------------------
add.eQTLs(tms, tbl.eqtl.01, pval=0.001, abs.beta=0.2, eqtl.title="brain.eqtl")
tbl.tms <- getMultiScoreTable(tms)
# 17 tfbs intersected eQTLs above threshold in 4 studies
checkEquals(length(which(tbl.tms$brain.eqtl==4)), 17)
checkEquals(length(which(tbl.tms$brain.eqtl==0)), 15)
checkEquals(sort(subset(tbl.eqtl.01, pvalue<=0.001 & abs(beta) >= 0.2)$study),
c("GTEx_V8.Brain_Caudate_basal_ganglia",
"GTEx_V8.Brain_Cerebellum",
"GTEx_V8.Brain_Hippocampus",
"GTEx_V8.Brain_Spinal_cord_cervical_c-1"))
as.list(table(tbl.tms$brain.eqtl))
fimo.regions.with.eqtl <- which(tbl.tms$brain.eqtl > 0)
fimo.regions.without.eqtl <- which(tbl.tms$brain == 0)
checkTrue(all(is.na(tbl.tms$ctx.rsid[fimo.regions.without.eqtl])))
checkEquals(length(fimo.regions.with.eqtl) + length(fimo.regions.without.eqtl), nrow(tbl.tms))
variants.passing.threshold <- nrow(subset(tbl.eqtl.01, pvalue <= 0.001 & abs(beta) >= 0.2))
checkEquals(variants.passing.threshold, 4)
#--------------------------------------------------------------------------------
# find eqtls hits for GTEx_V8.Brain_Cerebellum eQTLS, at the specified thresholds
#--------------------------------------------------------------------------------
tbl.eqtl.02 <- subset(tbl.eqtl, hg38 >= fimo.min &
hg38 <= fimo.max &
gene=="NDUFS2" &
grepl("GTEx_V8.Brain_Cerebellum", study, ignore.case=TRUE))
checkEquals(nrow(tbl.eqtl.02), 1)
add.eQTLs(tms, tbl.eqtl.02, pval=0.001, abs.beta=0.2, eqtl.title="cerebellum.eqtl")
tbl.tms <- getMultiScoreTable(tms)
checkEquals(length(which(tbl.tms$cerebellum.eqtl==1)), 17)
checkEquals(length(which(tbl.tms$cerebellum.eqtl==0)), 15)
if(FALSE){
igv <- start.igv("NDUFS2", "hg38")
tbl.track <- unique(tbl.sub[, c("chrom", "hg38", "hg38", "rsid")])
colnames(tbl.track)[2:3] <- c("start", "end")
tbl.track$start <- tbl.track$start - 1
track <- DataFrameAnnotationTrack("tbl.sub", tbl.track, color="red")
displayTrack(igv, track)
track <- DataFrameAnnotationTrack("fimo", tbl.fimo, color="darkblue")
displayTrack(igv, track)
tbl.hit <- subset(tbl.tms, eqtl.gtex.ctx > 0)
track <- DataFrameAnnotationTrack("hits", tbl.hit, color="darkGreen")
displayTrack(igv, track)
}
} # test_add.eqtls.atTagSNP
#------------------------------------------------------------------------------------------------------------------------
# two constrasting snps within 50 bp
# 1) rs34884997 1 161191082 2.07339e-10 NDUFS2 418 -0.403569 GTEx_V8.Brain_Cerebellum
# 2) rs41270037 1 161191043 7.80822e-02 NDUFS2 350 0.317322 GTEx_V8.Brain_Cerebellar_Hemisphere
#
test_add.eqtls.big.NDUFS2 <- function()
{
message(sprintf("--- test_add.eqtls.big.NDUFS2"))
targetGene <- "NDUFS2"
tbl.fimo <- get(load("../extdata/tbl.fimo.1e3.512bp.ndufs2.RData"))
tbl.oc <- get(load("~/github/TrenaProjectAD/inst/extdata/genomicRegions/boca-hg38-consensus-ATAC.RData"))
tbl.eqtl <- get(load("../extdata/tbl.eqtl.ndufs2.small.RData"))
dim(tbl.eqtl)
tbl.eqtl.oi <- subset(tbl.eqtl, gene=="NDUFS2" & grepl("GTEx_V8.Brain_C", study))
dim(tbl.eqtl.oi) # 138
tbl.tagHap <- read.table("~/github/ADvariantExplorer/explore/adamts4-study/haploreg-rs4575098-0.2.tsv",
sep="\t", as.is=TRUE, header=TRUE)
viz <- function(){ # used for selecting 4997 and 0037, chr1:161,191,018-161,191,117
igv <- start.igv("NDUFS2", "hg38")
zoomOut(igv)
zoomOut(igv)
track <- DataFrameAnnotationTrack("rs34884997",
data.frame(chrom=1, start=161191081, end=161191082, name="rs34884997",
stringsAsFactors=FALSE), color="darkblue")
displayTrack(igv, track)
tbl.track <- tbl.tagHap[, c("chrom", "hg38", "hg38", "rSquared")]
colnames(tbl.track)[2:3] <- c("start", "end")
tbl.track$start <- tbl.track$start - 1
track <- DataFrameQuantitativeTrack("tagHap", tbl.track, autoscale=TRUE, color="darkgreen")
displayTrack(igv, track)
tbl.track <- tbl.eqtl.oi[, c("chrom", "hg38", "hg38", "rsid")]
colnames(tbl.track)[2:3] <- c("start", "end")
tbl.track$start <- tbl.track$start -1
track <- DataFrameAnnotationTrack("eQTL", tbl.track, color="black", displayMode="COLLAPSE")
displayTrack(igv, track)
length(unique(tbl.fimo$tf)) # 350
track <- DataFrameAnnotationTrack("fimo", tbl.fimo, color="red", displayMode="COLLAPSE")
displayTrack(igv, track)
} # viz
tpad <- TrenaProjectAD()
loc.start <- 161191018
loc.end <- 161191117
f <- "~/github/ADvariantExplorer/explore/adamts4-study/ndufs2/tbl.fimo.NDUFS2.RData"
file.exists(f)
tbl.fimo <- get(load(f))
tbl.fimo.small <- subset(tbl.fimo, start >= loc.start & end <= loc.end)
dim(tbl.fimo.small) # 293 9
save(tbl.fimo.small, file="../extdata/tbl.fimo.1e3.100bp-rs34884997.ndufs2.RData")
fimo.min <- min(tbl.fimo.small$start)
fimo.max <- max(tbl.fimo.small$end)
fimo.chrom <- tbl.fimo.small$chrom[1]
printf("--- fimo span: %d bases", (fimo.max - fimo.min))
tms <- TrenaMultiScore(tpad, targetGene, tbl.fimo.small, tbl.oc, quiet=FALSE)
#--------------------------------------------------------------------------------
# filter the eqtls, producing 10 reports of the tag.snp, different pvalue & beta
#--------------------------------------------------------------------------------
tbl.eqtl.01 <- subset(tbl.eqtl, hg38 >= fimo.min &
hg38 <= fimo.max &
gene=="NDUFS2" &
grepl("GTEx_V8.Brain_C", study, ignore.case=TRUE))
dim(tbl.eqtl.01)
new.order <- order(abs(tbl.eqtl.01$pvalue), decreasing=FALSE)
tbl.eqtl.01 <- tbl.eqtl.01[new.order,]
dim(tbl.eqtl.01) # 8 9
#----------------------------------------------------
# further filtering will be done in this method call
#----------------------------------------------------
add.eQTLs(tms, tbl.eqtl.01, pval=0.1, abs.beta=0.15, eqtl.title="gtex.brain_c")
tbl.tms <- getMultiScoreTable(tms)
table(tbl.tms$gtex.brain_c) # 0 1 4
# 221 42 30
#---------------------------------------------------
# we started with two variants, separated by about 38bp
# hg38 pval beta
# 1) rs41270037 1 161191043 7.80822e-02 NDUFS2 350 0.317322 GTEx_V8.Brain_Cerebellar_Hemisphere
# 2) rs34884997 1 161191082 2.07339e-10 NDUFS2 418 -0.403569 GTEx_V8.Brain_Cerebellum
# the first, a weaker eQTL, only appears in one GTEx brain study
# the second, much stronger, appears in multiple GTEx brain studies
# we see that in the table result above. let's now check the coordinates of those 42
# motifs which intersect the first variant
tbl.variant.1 <- as.data.frame(reduce(GRanges(subset(tbl.tms, gtex.brain_c==1)[, c("chrom", "start", "end")])))
tbl.variant.2 <- as.data.frame(reduce(GRanges(subset(tbl.tms, gtex.brain_c==4)[, c("chrom", "start", "end")])))
checkTrue(161191043 %in% tbl.variant.1$start:tbl.variant.1$end)
checkTrue(161191082 %in% tbl.variant.2$start:tbl.variant.2$end)
starts.1 <- subset(tbl.tms, gtex.brain_c==1)$start
ends.1 <- subset(tbl.tms, gtex.brain_c==1)$end
tbl.1 <- subset(tbl.tms, gtex.brain_c==1)[, c("start", "end")]
checkTrue(all(unlist(lapply(seq_len(nrow(tbl.1)),
function(r) 161191043 %in% tbl.1$start[r]: tbl.1$end[r]))))
tbl.2 <- subset(tbl.tms, gtex.brain_c==4)[, c("start", "end")]
checkTrue(all(unlist(lapply(seq_len(nrow(tbl.2)),
function(r) 161191082 %in% tbl.2$start[r]: tbl.2$end[r]))))
} # test_add.eqtls.big.NDUFS2
#----------------------------------------------------------------------------------------------------
explore.add.eqtls <- function()
{
print(load("add.eqtls.tmp.RData")) # "tbl.fimo" "tbl.eqtls" "tbl.ov"
dim(tbl.fimo) # 1260 12
tbl.eqtls <- subset(tbl.eqtls, gene=="NDUFS2")
dim(tbl.eqtls) # 178 9
dim(tbl.ov) # 88 2
length(unique(tbl.ov[,1])) # 23 eqtls
length(unique(tbl.ov[,2])) # in 38 fimo regions
tbl.counts.per.fimo.region <- as.data.frame(table(tbl.ov[,2]))
tbl.fimo$eqtlCount[tbl.counts.per.fimo.region[,1]] <- tbl.counts.per.fimo.region[,2]
} # explore.add.eqtls
#------------------------------------------------------------------------------------------------------------------------
demo.ccl1.tpe <- function()
{
printf("--- demo.ccl1.tpe")
tpe <- TrenaProjectErythropoiesis()
tms <- TrenaMultiScore(tpe, "CCL1");
tbl.gh <- getGeneHancerRegion(tms) # 240k
with(tbl.gh, findOpenChromatin(tms, chrom, start, end))
tbl.oc <- getOpenChromatin(tms)
tbl.corces.paul <- get(load("~/github/TrenaProjectErythropoiesis/misc/diffBind/corces/tbl.corces.hg38.scoredByPaul.RData"))
tbl.corces.rory <- get(load("~/github/TrenaProjectErythropoiesis/misc/diffBind/corces/tbl.diffBind.rory.hg38.day0-4.RData"))
tbl.corces.paul.ccl1 <- subset(tbl.corces.paul, chrom==tbl.gh$chrom & start >= tbl.gh$start & end <= tbl.gh$end)
tbl.corces.rory.ccl1 <- subset(tbl.corces.rory, chrom==tbl.gh$chrom & start >= tbl.gh$start & end <= tbl.gh$end)
tbl.corces.rory.ccl1$Fold <- -1 * tbl.corces.rory.ccl1$Fold
dim(tbl.corces.paul.ccl1)
dim(tbl.corces.rory.ccl1)
igv <- start.igv("CCL1")
track <- DataFrameQuantitativeTrack("corces.paul", tbl.corces.paul.ccl1[, c(1,2,3,13)],
color="red", autoscale=TRUE)
displayTrack(igv, track)
track <- DataFrameQuantitativeTrack("corces.rory", tbl.corces.rory.ccl1[, c(1,2,3,9)],
color="red", autoscale=TRUE)
displayTrack(igv, track)
ghdb <- GeneHancerDB()
tbl.gh.cd34 <- retrieveEnhancersFromDatabase(ghdb, "CCL1",
tissues="Common myeloid progenitor CD34+")
dim(tbl.gh.cd34) # 5 16
track <- DataFrameQuantitativeTrack("GH.cd34", tbl.gh.cd34[, c(1,2,3,11)],
color="random", autoscale=FALSE, min=0, max=20)
displayTrack(igv, track)
tbl.gh.all <- retrieveEnhancersFromDatabase(ghdb, "CCL1", tissues="all")
dim(tbl.gh.all) # 18 16
track <- DataFrameQuantitativeTrack("GH.all", tbl.gh.all[, c(1,2,3,11)],
color="brown", autoscale=FALSE, min=0, max=20)
displayTrack(igv, track)
#------------------------------------------------------------
# add another track, the +/- ratio of day0 vs day2 reads.
# day2 > day0 reads
#------------------------------------------------------------
day0.mean <- as.numeric(lapply(seq_len(nrow(tbl.corces.paul.ccl1)),
function(i) mean(as.numeric(tbl.corces.paul.ccl1[i, 4:7]))))
day2.mean <- as.numeric(lapply(seq_len(nrow(tbl.corces.paul.ccl1)),
function(i) mean(as.numeric(tbl.corces.paul.ccl1[i, 8:9]))))
avg0 <- day0.mean/day2.mean
avg2 <- day2.mean/day0.mean
avg0[is.infinite(avg0)] <- 0
avg2[is.infinite(avg2)] <- 0
score2 <- rep(0, nrow(tbl.corces.paul.ccl1))
score2[which(avg0 > 1)] <- -avg0[avg0 > 1]
score2[which(avg2 > 1)] <- avg2[avg2 > 1]
score2 <- round(score2, digits=2)
tbl.corces.paul.ccl1$score2 <- score2
track <- DataFrameQuantitativeTrack("score2", tbl.corces.paul.ccl1[, c("chrom", "start", "end", "score2")],
color="blue", autoscale=FALSE,
min=-10, max=30)
displayTrack(igv, track)
#------------------------------------------------------------
# bring in gene expression data
#------------------------------------------------------------
f <- system.file(package="TrenaProjectErythropoiesis", "extdata", "expression",
"brandLabDifferentiationTimeCourse-27171x28.RData")
file.exists(f)
mtx.rna <- get(load(f))
cor.all <- lapply(rownames(mtx.rna), function(gene)
cor(mtx.rna[gene, 1:4], mtx.rna["CCL1", 1:4]))
names(cor.all) <- rownames(mtx.rna)
fivenum(as.numeric(cor.all))
# -0.68529152 0.06930292 0.26896765 0.48405971 0.89141070
hist(as.numeric(cor.all), main="correlation of CCL1 againstall")
#------------------------------------------------------------
# add ChIP indiscriminately
#------------------------------------------------------------
tbl.chip <- with(tbl.gh, getChipSeq(tpe, chrom[1], min(start)-1000, max(end)+1000))
tfs <- names(rev(sort(table(tbl.chip$tf))))
for(tf.this in tfs){
if(tf.this %in% names(cor.all)){
cor.tf <- cor.all[[tf.this]]
if(abs(cor.tf) > 0.95){
printf("tf %s: %f", tf.this, cor.tf)
color <- "red"
if(cor.tf < 0)
color <- "blue"
tbl.tf <- subset(tbl.chip, tf==tf.this)
track <- DataFrameAnnotationTrack(tf.this, tbl.tf[, c(1,2,3)], color=color,,
displayMode="COLLAPSED", trackHeight=23)
displayTrack(igv, track)
} # if abs
} # tf in cor
} # for tf.this
} # demo.ccl1.tpe
#------------------------------------------------------------------------------------------------------------------------
demo.ndufs2.tpad <- function()
{
message(sprintf("--- demo.ndufs2.tpad"))
targetGene <- "NDUFS2"
tpad <- TrenaProjectAD()
tbl.fimo <- get(load(system.file(package="TrenaMultiScore", "extdata", "ndufs2-fimo3-2kb-continuous.RData")))
dim(tbl.fimo)
tbl.oc <- get(load(system.file(package="TrenaMultiScore", "extdata", "ndufs2-hint16-footprints-2kb-RData")))
dim(tbl.oc)
checkEquals(colnames(tbl.oc), c("chrom", "start", "end"))
tms <- TrenaMultiScore(tpad, targetGene, tbl.fimo, tbl.oc, quiet=FALSE)
mtx.rna <- get(load("~/github/TrenaProjectAD/inst/extdata/expression/mayo.tcx.16969x262.covariateCorrection.log+scale.RData"))
scoreMotifHitsForOpenChromatin(tms)
scoreMotifHitsForConservation(tms)
scoreMotifHitsForGeneHancer(tms)
addDistanceToTSS(tms)
addGenicAnnotations(tms)
addChIP(tms)
addGeneExpressionCorrelations(tms, mtx.rna)
tbl.tms <- getMultiScoreTable(tms)
dim(tbl.tms) # 5274 18
checkTrue(mean(tbl.tms$gh) > 0)
checkEquals(as.numeric(as.list(table(tbl.tms$oc))), c(1667, 3607)) # FALSE, TRUE
checkEquals(as.numeric(as.list(table(tbl.tms$chip))), c(4352, 922)) # FALSE, TRUE
igv <- start.igv(targetGene, "hg38")
zoomOut(igv)
library(ghdb)
ghdb <- GeneHancerDB()
tbl.gh <- retrieveEnhancersFromDatabase(ghdb, targetGene, tissues="all")
tbl.gh$score <- asinh(tbl.gh$combinedscore)
track <- DataFrameQuantitativeTrack("GH", tbl.gh[, c("chrom", "start", "end", "score")],
autoscale=TRUE, color="brown")
displayTrack(igv, track)
print(load("~/github/TrenaProjectAD/explore/mayo-epigenetics/atac/earlyResultMayoWholeGenome.RData"))
colnames(tbl.dba)[1] <- "chrom"
tbl.dba$chrom <- as.character(tbl.dba$chrom)
roi <- getGenomicRegion(igv)
roi
tbl.track <- subset(tbl.dba, chrom==roi$chrom & start >= roi$start & end <= roi$end)
dim(tbl.track)
track <- DataFrameAnnotationTrack("atac", tbl.track, color="brown")
displayTrack(igv, track)
tag.snp <- "rs4575098"
ld.snp <- "rs11585858" # 0.98 LD with 5098
require("EndophenotypeExplorer")
etx <- EndophenotypeExplorer$new(targetGene, "hg38")
rsid.list <- c(tag.snp, ld.snp)
tbl.locs <- etx$rsidToLoc(rsid.list)
tbl.track <- tbl.locs[, c("chrom", "hg38", "hg38")]
colnames(tbl.track) <- c("chrom", "start", "end")
tbl.track$start <- tbl.track$start - 1
track <- DataFrameAnnotationTrack("snps", tbl.track, "red", trackHeight=24)
displayTrack(igv, track)
tbl.sub <- subset(tbl.tms, abs(cor) > 0.75)
dim(tbl.sub)
tfs <- unique(tbl.sub$tf)
length(tfs) # 8
for(TF in tfs){
tbl.track <- subset(tbl.sub, tf==TF)[, c("chrom", "start", "end", "p.value")]
tbl.track$score <- -log10(tbl.track$p.value)
tbl.track <- tbl.track[, c("chrom", "start", "end", "score")]
track <- DataFrameQuantitativeTrack(TF, tbl.track, autoscale=TRUE, color="random")
displayTrack(igv, track)
} # for TF
} # demo.ndfus2.tpad
#------------------------------------------------------------------------------------------------------------------------
if(!interactive())
runTests()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.