library(RUnit)
library(MotifDb)
#----------------------------------------------------------------------------------------------------
library(BigFimo)
#----------------------------------------------------------------------------------------------------
printf <- function(...) print(noquote(sprintf(...)))
#----------------------------------------------------------------------------------------------------
# create a new meme file:
# library(MotifDb)
# motifs <- query(MotifDb, c("sapiens","jaspar2022"))
# length(motifs) # [1] 692
# export(motifs, con="~/github/bigFimo/jaspar2022-human.meme", format="meme")
# modiffy ~/github/bigFimo/helpers/fimoProcess-hg38.R:
# if(nrow(tbl.regions) > 0){
# meme.file <- "~/github/bigFimo/jaspar2022-human.meme"
# tbl.fimo <- fimoBatch(tbl.regions, matchThreshold=fimo.threshold, genomeName="hg38", pwmFile=meme.file)
# } #
#----------------------------------------------------------------------------------------------------
runTests <- function()
{
test_ctor()
test_fimoBatchOneRegionOnly()
test_BigFimoOneRegionOnly
# test_newMemeFile()
test_subdivideRegion_1()
test_subdivideRegion_2()
test_subdivideRegion_3()
test_subdivideRegion_10()
test_subdivideBigRegion_30()
test_singleSmallRegionUsage()
test_singleHugeRegionUsage()
} # runTests
#---------------------------------------------------------------------------------------------------
# BigFimo originally (and optionally, still) depended on genehancer to provide genomic regions with
# reported transcriptional relations with the targetGene. another option, which could be inovked at
# the same time, specifies known open chromatin regions though these options are still supported,
# and ctor tests can be found in old_test_BigFimo.R, our usage has evolved towards calling FIMO only
# on an entire contiguous genomic region (perhaps +/- 1M) around a gene of interest. we make
# subsequent use of trena's FeatureTable to add in genehancer and open chromatin data, annotating
# corresponding fimo-identified regions with genehancer & open chromatin scores
test_ctor <- function()
{
message(sprintf("--- test_ctor"))
targetGene <- "ZBTB7A"
if(file.exists(targetGene)){
unlink(targetGene, recursive=TRUE)
}
# hg38, targetGene's chomLoc plus some upstream
chrom <- "chr19"
start <- 4040801
end <- 4077194
#-----------------------------------------------------------------
# partition the target region into 3 slightly overlapping parts
# overlap here is 60, with duplicates removed in later processing
#-----------------------------------------------------------------
count <- 3 # number of processes, hence also number of genomic regions
half.overlap <- 30
landmarks <- seq(from=start, to=end, length.out=count+1)
starts <- landmarks[1:count]
ends <- landmarks[2:(count+1)]
starts <- starts - half.overlap
ends <- ends + half.overlap
tbl.roi <- data.frame(chrom=chrom, start=starts, end=ends, stringsAsFactors=FALSE)
motifs <- query(MotifDb, c("sapiens"), c("jaspar2022"))
bf <- BigFimo$new(targetGene,
tbl.oc=tbl.roi, # not actually open chromatin - this includes all bases
processCount=count,
fimoThreshold=1e-6,
use.genehancer=FALSE,
gh.elite.only=FALSE,
maxGap.between.oc.and.gh=NA,
chrom=chrom, start=start, end=end,
motifs=motifs,
meme.file.path=file.path(targetGene, "motifs.meme"))
filenames.roi <- bf$createFimoTables()
bf$createMemeFile()
checkEquals(is(bf), "BigFimo")
tbl.gh <- bf$get.tbl.gh()
checkEquals(nrow(tbl.gh), 0)
} # test_ctor
#---------------------------------------------------------------------------------------------------
test_fimoBatchOneRegionOnly <- function()
{
message(sprintf("--- test_fimoBatchOneRegionOnly"))
if(Sys.info()[["nodename"]] %in% c("hagfish.local", "khaleesi.systemsbiology.net")){
source("~/github/fimoService/batchMode/fimoBatchTools.R")
} else { # assume this is a properly configured docker
source("/usr/local/scripts/fimoBatchTools.R")
}
tbl.regions <- data.frame(chrom="chr19", start=12886171, end=12886897)
fimo.threshold <- 1e-6
meme.file <- "tmp/motifs.meme"
# motifs <- query(MotifDb, c("sapiens","jaspar2022"))
motifs <- query(MotifDb, c("sapiens"),c("jaspar2024", "hocomocov13"))
length(motifs) # [1] 2078
export(motifs, con=meme.file, format="meme")
tbl.fimo <- fimoBatch(tbl.regions, matchThreshold=fimo.threshold,
genomeName="hg38", pwmFile=meme.file)
print(dim(tbl.fimo))
checkTrue(nrow(tbl.fimo) > 80) # 82 9
} # test_basicOneRegionOnly
#---------------------------------------------------------------------------------------------------
test_BigFimoOneRegionOnly <- function()
{
message(sprintf("--- test_BigFimoeOneRegionOnly"))
targetGene <- "KLF1"
if(file.exists(targetGene)){
unlink(targetGene, recursive=TRUE)
}
tbl.roi <- data.frame(chrom="chr19", start=12886171, end=12886897)
meme.file <- "tmp/motifs.meme"
motifs <- query(MotifDb, c("sapiens"),c("jaspar2024", "hocomocov13"))
length(motifs) # [1] 2078
export(motifs, con=meme.file, format="meme")
processCount <- 1
bf <- BigFimo$new(targetGene,
tbl.oc=tbl.roi, # not actually open chromatin - this includes all bases
processCount=processCount,
fimoThreshold=1e-6,
use.genehancer=FALSE,
gh.elite.only=FALSE,
maxGap.between.oc.and.gh=NA,
chrom=tbl.roi$chrom,
start=tbl.roi$start,
end=tbl.roi$end,
motifs=motifs,
meme.file.path=meme.file)
filenames.roi <- bf$createFimoTables()
checkEquals(length(filenames.roi), 1)
checkEquals(filenames.roi[1], "KLF1.01.fimoRegions-00001.RData")
#bf$createMemeFile()
checkTrue(file.exists(meme.file))
checkEquals(length(filenames.roi), processCount)
# make sure these RData files, which will be read by a script that directly
# runs FIMO, match the regions calculated above
for(i in seq_len(length(filenames.roi))){
tbl.r <- get(load(file.path(targetGene, filenames.roi[i])))
checkEquals(tbl.r, tbl.roi[i,])
}
bf$runMany()
bf$waitForCompletion(sleepInterval=1)
fimo.output.files.by.region <-
list.files(path=targetGene,
pattern=sprintf("^fimo.%s.*RData", targetGene))
checkEquals(length(fimo.output.files.by.region), processCount)
tbl.fimo <- bf$combineResults()
checkEquals(colnames(tbl.fimo), c("chrom", "start", "end", "tf", "strand", "score",
"p.value", "matched_sequence", "motif_id"))
checkTrue(nrow(tbl.fimo) > 60) # 82 currently
} # test_BigFimoeOneRegionOnly
#----------------------------------------------------------------------------------------------------
# typical use proceeds by dividing a large genomic region into slightly overlapping
# subregions, running fimo independently on each region, then combining and uniquing
# the results. test here our ability to reliably and reproducibly subdivide
# a big region
test_subdivideRegion_1 <- function()
{
message(sprintf("--- test_subdivideRegion_1"))
#------------------------------------------------
# just one small region, no subdivision to start
#------------------------------------------------
chrom <- "chr1"
start <- 100
end <- 200
number.of.regions <- 1
overlap <- 30
half.overlap <- as.integer(overlap/2)
tbl.roi <- subdivideGenomicRegion(chrom, start, end, 1, overlap)
checkEquals(tbl.roi$chrom, chrom)
checkEquals(tbl.roi$start, start - half.overlap)
checkEquals(tbl.roi$end, end + half.overlap)
checkEquals(tbl.roi$size, (2 * half.overlap) + 1 + end - start)
} # test_subdivideRegion_1
#----------------------------------------------------------------------------------------------------
test_subdivideRegion_2 <- function()
{
message(sprintf("--- test_subdivideRegion_2"))
#------------------------------------------------
# just one small region, 2 subdivisions
#------------------------------------------------
chrom <- "chr1"
start <- 100
end <- 200
number.of.regions <- 2
overlap <- 30
half.overlap <- as.integer(overlap/2)
tbl.roi <-
subdivideGenomicRegion(chrom, start, end, number.of.regions, overlap)
checkEquals(nrow(tbl.roi), number.of.regions)
checkEquals(tbl.roi$chrom, rep(chrom, number.of.regions))
#------------------------------------------------------------
# use reduce(GRanges): is the entire ranged plus shoulders,
# covered in one row?
#------------------------------------------------------------
tbl.reduced <- as.data.frame(reduce(GRanges(tbl.roi)))
checkEquals(nrow(tbl.reduced), 1)
expected.span <- (2 * half.overlap) + 1 + end - start
checkEquals(tbl.reduced$width, expected.span)
checkEquals(tbl.roi$size, c(81, 81))
} # test_subdivideRegion_2
#---------------------------------------------------------------------------------------------------
test_subdivideRegion_3 <- function()
{
message(sprintf("--- test_subdivideRegion_3"))
#------------------------------------------------
# just one small region, 3 subdivisions
#------------------------------------------------
chrom <- "chr1"
start <- 100
end <- 200
number.of.regions <- 3
overlap <- 30
half.overlap <- as.integer(overlap/2)
tbl.roi <-
subdivideGenomicRegion(chrom, start, end, number.of.regions, overlap)
checkEquals(nrow(tbl.roi), number.of.regions)
checkEquals(tbl.roi$chrom, rep(chrom, number.of.regions))
#------------------------------------------------------------
# use reduce(GRanges): is the entire ranged plus shoulders,
# covered in one row?
#------------------------------------------------------------
tbl.reduced <- as.data.frame(reduce(GRanges(tbl.roi)))
checkEquals(nrow(tbl.reduced), 1)
expected.span <- (2 * half.overlap) + 1 + end - start
checkEquals(tbl.reduced$width, expected.span)
checkEquals(tbl.roi$size, c(64, 65, 64))
} # test_subdivideRegion_3
#---------------------------------------------------------------------------------------------------
test_subdivideRegion_10 <- function()
{
message(sprintf("--- test_subdivideRegion_10"))
chrom <- "chr1"
start <- 100
end <- 200
number.of.regions <- 10
overlap <- 0
half.overlap <- as.integer(overlap/2)
tbl.roi <-
subdivideGenomicRegion(chrom, start, end, number.of.regions, overlap)
checkEquals(nrow(tbl.roi), number.of.regions)
checkEquals(tbl.roi$chrom, rep(chrom, number.of.regions))
#------------------------------------------------------------
# use reduce(GRanges): is the entire ranged plus shoulders,
# covered in one row?
#------------------------------------------------------------
tbl.reduced <- as.data.frame(reduce(GRanges(tbl.roi)))
checkEquals(nrow(tbl.reduced), 1)
expected.span <- (2 * half.overlap) + 1 + end - start
checkEquals(tbl.reduced$width, expected.span)
checkEquals(tbl.roi$size, rep(11, number.of.regions))
} # test_subdivideRegion_10
#---------------------------------------------------------------------------------------------------
test_subdivideBigRegion_30 <- function()
{
message(sprintf("--- test_subdivideBigRegion_30"))
targetGene <- "ZBTB7A" # not actually used here, just for explanation
chrom <- "chr19"
tss <- 4066900
start <- tss - 1e6
end <- tss + 1e6
number.of.regions <- 30
overlap <- 100
half.overlap <- as.integer(overlap/2)
tbl.roi <-
subdivideGenomicRegion(chrom, start, end, number.of.regions, overlap)
checkEquals(nrow(tbl.roi), number.of.regions)
checkEquals(tbl.roi$chrom, rep(chrom, number.of.regions))
#------------------------------------------------------------
# use reduce(GRanges): is the entire ranged plus shoulders,
# covered in one row?
#------------------------------------------------------------
tbl.reduced <- as.data.frame(reduce(GRanges(tbl.roi)))
checkEquals(nrow(tbl.reduced), 1)
expected.span <- (2 * half.overlap) + 1 + end - start
checkEquals(tbl.reduced$width, expected.span)
region.sizes <- tbl.roi$size
checkTrue(all(region.sizes >= 66767 & region.sizes <= 66768))
} # test_subdivideBigRegion_30
#---------------------------------------------------------------------------------------------------
test_singleSmallRegionUsage <- function()
{
message(sprintf("--- test_singleSmallRegionUsage"))
targetGene <- "ZBTB7A"
if(file.exists(targetGene)){
unlink(targetGene, recursive=TRUE)
}
# hg38, targetGene's chomLoc plus some upstream
chrom <- "chr19"
start <- 4040801
end <- 4077194
processCount <- 3 # number of processes, hence also number of genomic regions
number.of.regions <- processCount
#-----------------------------------------------------------------
# partition the target region into 3 slightly overlapping parts
# overlap here is 60, with duplicates removed in later processing
#-----------------------------------------------------------------
overlap <- 100
half.overlap <- as.integer(overlap/2)
tbl.roi <-
subdivideGenomicRegion(chrom, start, end, number.of.regions, overlap)
# quick sanity check: just one row, span slightly larger than requested
tbl.reduced <- as.data.frame(reduce(GRanges(tbl.roi)))
checkEqualsNumeric(tbl.reduced$width/(1+end-start), 1.0, tol=1e-2)
checkTrue(tbl.reduced$width/(1+end-start) > 1)
#motifs <- query(MotifDb, c("sapiens"), c("jaspar2022"))
motifs <- query(MotifDb, c("sapiens"), c("jaspar2024"))
meme.file <- file.path(targetGene, "motifs.meme")
meme.file <- "motifs.meme"
rtracklayer::export(motifs, con=meme.file, format="meme")
bf <- BigFimo$new(targetGene,
tbl.oc=tbl.roi, # not actually open chromatin - this includes all bases
processCount=processCount,
fimoThreshold=1e-5,
use.genehancer=FALSE,
gh.elite.only=FALSE,
maxGap.between.oc.and.gh=NA,
chrom=chrom, start=start, end=end,
motifs=motifs,
meme.file.path=meme.file)
filenames.roi <- bf$createFimoTables()
#bf$createMemeFile()
checkTrue(file.exists(meme.file))
checkEquals(length(filenames.roi), processCount)
# make sure these RData files, which will be read by a script that directly
# runs FIMO, match the regions calculated above
for(i in seq_len(length(filenames.roi))){
tbl.r <- get(load(file.path(targetGene, filenames.roi[i])))
checkEquals(tbl.r, tbl.roi[i,])
}
bf$runMany()
bf$waitForCompletion(sleepInterval=1)
fimo.output.files.by.region <-
list.files(path=targetGene,
pattern=sprintf("^fimo.%s.*RData", targetGene))
checkEquals(length(fimo.output.files.by.region), processCount)
tbl.fimo <- bf$combineResults()
checkEquals(colnames(tbl.fimo), c("chrom", "start", "end", "tf", "strand", "score",
"p.value", "matched_sequence", "motif_id"))
checkTrue(nrow(tbl.fimo) > 1600) # 2022 on 18 mar 2025
#--------------------------------------------------
# make sure that FIMO matches start and end very close to
# the requested chromosomal coordinates
#--------------------------------------------------
checkTrue(abs(min(tbl.fimo$start) - start) < 100)
checkTrue(abs(max(tbl.fimo$end) - end) < 100)
} # test_singleSmallRegionUsage
#----------------------------------------------------------------------------------------------------
test_singleHugeRegionUsage <- function()
{
message(sprintf("--- test_singleHUGERegionUsage"))
targetGene <- "NDUFS2"
if(file.exists(targetGene)){
unlink(targetGene, recursive=TRUE)
}
# hg38, targetGene's chomLoc plus some upstream
chrom <- "chr1"
start <- 160660231
end <- 161753478
#-----------------------------------------------------------------
# partition the target region into 3 slightly overlapping parts
# overlap here is 60, with duplicates removed in later processing
#-----------------------------------------------------------------
processCount <- 20 # number of processes, hence also number of genomic regions
number.of.regions <- processCount
overlap <- 60
half.overlap <- as.integer(overlap/2)
tbl.roi <-
subdivideGenomicRegion(chrom, start, end, number.of.regions, overlap)
tbl.reduced <- as.data.frame(reduce(GRanges(tbl.roi)))
checkEquals(nrow(tbl.reduced), 1) # ensures we have continuous coverage
checkEqualsNumeric(tbl.reduced$width/(1+end-start), 1.0, tol=1e-2)
# tbl.roi should cover just a bit more than requested
checkTrue(tbl.reduced$width/(1+end-start) > 1)
checkTrue(tbl.reduced$width/(1+end-start) < 1.01)
motifs <- query(MotifDb, c("sapiens"), c("jaspar2022"))
meme.file <- "motifs.meme"
rtracklayer::export(motifs, con=meme.file, format="meme")
bf <- BigFimo$new(targetGene,
tbl.oc=tbl.roi, # not actually open chromatin - this includes all bases
processCount=processCount,
fimoThreshold=1e-4,
use.genehancer=FALSE,
gh.elite.only=FALSE,
maxGap.between.oc.and.gh=NA,
chrom=chrom, start=start, end=end,
motifs=motifs,
meme.file.path=meme.file)
filenames.roi <- bf$createFimoTables()
checkTrue(file.exists(meme.file))
checkEquals(length(filenames.roi), processCount)
# make sure these RData files, which will be read by a script that directly
# runs FIMO, match the regions calculated above
for(i in seq_len(length(filenames.roi))){
tbl.r <- get(load(file.path(targetGene, filenames.roi[i])))
checkEquals(tbl.r, tbl.roi[i,])
}
bf$runMany()
bf$waitForCompletion(sleepInterval=10)
fimo.output.files.by.region <-
list.files(path=targetGene,
pattern=sprintf("^fimo.%s.*RData", targetGene))
checkEquals(length(fimo.output.files.by.region), processCount)
tbl.fimo <- bf$combineResults()
checkEquals(colnames(tbl.fimo), c("chrom", "start", "end", "tf", "strand", "score",
"p.value", "matched_sequence", "motif_id"))
printf("tbl.fimo: %d nrows", nrow(tbl.fimo))
checkTrue(nrow(tbl.fimo) > 200000)
checkTrue(nrow(tbl.fimo) < 300000)
#--------------------------------------------------
# make sure that FIMO matches start and end very close to
# the requested chromosomal coordinates
#--------------------------------------------------
checkTrue(abs(min(tbl.fimo$start) - start) < 100)
checkTrue(abs(max(tbl.fimo$end) - end) < 100)
} # test_singleHugeRegionUsage
#----------------------------------------------------------------------------------------------------
viz <- function()
{
igv <- start.igv("BACH1")
zoomOut(igv); zoomOut(igv);
library(ghdb)
ghdb <- GeneHancerDB()
tbl.gh <- retrieveEnhancersFromDatabase(ghdb, "BACH1", tissues="all")
dim(tbl.gh)
tbl.gh.strong <- subset(tbl.gh, elite)
dim(tbl.gh.strong)
track <- DataFrameQuantitativeTrack("gh.elite", tbl.gh.strong[, c("chrom", "start", "end", "combinedscore")],
autoscale=TRUE, color="brown")
displayTrack(igv, track)
dir <- "~/github/TrenaProjectErythropoiesis/inst/extdata/genomicRegions"
full.path <- file.path(dir, "tbl.atacMerged.RData")
stopifnot(file.exists(full.path))
tbl.atacMerged <- get(load(full.path))
roi <- getGenomicRegion(igv)
tbl.atac.sub <- subset(tbl.atacMerged, chrom==roi$chrom & start >= roi$start & end <= roi$end)
dim(tbl.atac.sub)
track <- DataFrameAnnotationTrack("atac", tbl.atac.sub[, c("chrom", "start", "end")],
color="blue", trackHeight=24)
displayTrack(igv, track)
# good test region here: "chr21:29,269,562-29,295,745"
showGenomicRegion(igv, "chr21:29,269,562-29,295,745")
} # viz
#---------------------------------------------------------------------------------------------------
if(!interactive())
runTests()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.