test_HumanDHSFilter.R
In trena: Fit transcriptional regulatory networks using gene expression, priors, machine learning

library(trena)
library(MotifDb)
library(RUnit)
library(RPostgreSQL)
#----------------------------------------------------------------------------------------------------
printf <- function(...) print(noquote(sprintf(...)))
mef2c.tss <- 88904257
mef2c.promoter.region <- list(chrom="chr5", start=mef2c.tss-100, end=mef2c.tss+100)
mef2c.promoter.string <- with(mef2c.promoter.region, sprintf("%s:%d-%d", chrom, start, end))
#----------------------------------------------------------------------------------------------------
runTests <- function()
{
   test_basicConstructor()
   test_getOpenChromatinFastAndSimple()

   test_getEncodeRegulatoryTableNames()
   test_checkSampleOfEncodeTables(quiet=FALSE)

   test_getRegulatoryRegions()
   test_getCandidates.emptyRegion()
   test_getCandidates.vrk2.twoRegions()
   test_getCandidates.vrk2.rs13384219.variant()

} # runTests
#----------------------------------------------------------------------------------------------------
# reuse this in several tests
create.vrk2.candidateFilterSpec <- function(promoter.length=1000)
{
   target.gene <- "VRK2"
   genome <- "hg38"
   chromosome <- "chr2"
   tss <- 57907651
   promoter.length <- promoter.length
   db.address <- system.file(package="trena", "extdata")
   genome.db.uri    <- paste("sqlite:/", db.address, "vrk2.neighborhood.hg38.gtfAnnotation.db",  sep = "/")
   tbl.regions <- data.frame(chrom="chr2", start=57906700, end=57906870, stringsAsFactors=FALSE)

   candidateFilterSpec <- list(filterType="EncodeDNaseClusters",
                               genomeName=genome,
                               encodeTableName="wgEncodeRegDnaseClustered",
                               pwmMatchPercentageThreshold=85L,
                               geneInfoDB=genome.db.uri,
                               regions=tbl.regions,
                               pfms = as.list(query(query(MotifDb, "sapiens"),"jaspar2016")),
                               variants=NA_character_)

   candidateFilterSpec

} # create.vrk2.candidateFilterSpec
#----------------------------------------------------------------------------------------------------
# reuse this in several tests
create.vrk2.candidateFilterSpec.twoRegions <- function()
{
   target.gene <- "VRK2"
   genome <- "hg38"
   chromosome <- "chr2"
   tss <- 57907651
   promoter.length <- 1000
   db.address <- system.file(package="trena", "extdata")
   genome.db.uri    <- paste("sqlite:/", db.address, "vrk2.neighborhood.hg38.gtfAnnotation.db",  sep = "/")

   tbl.regions <- data.frame(chrom=c("chr2", "chr2"),
                             start=c(57906700, 57907740),
                             end=c(57906870, 57908150),
                             stringsAsFactors=FALSE)

   cfSpec <- list(filterType="EncodeDNaseClusters",
                  genomeName=genome,
                  encodeTableName="wgEncodeRegDnaseClustered",
                  pwmMatchPercentageThreshold=85L,
                  geneInfoDB= genome.db.uri,
                  regions=tbl.regions,
                  pfms = as.list(query(query(MotifDb, "sapiens"),"jaspar2016")),
                  variants=NA_character_)

   cfSpec

} # create.vrk2.candidateFilterSpec.twoRegions
#----------------------------------------------------------------------------------------------------
#  rs13384219  A->G
#  rs13384219 at chr2:57907073-57907573
#  gtcagtagtggtggaaccagcatgc[A/G]aattagacaatgtgacttcatagcc
#  Chromosome: 2:57907323
#  vrk2 tss at chr2:57908651
create.vrk2.rs13384219.variant.candidateFilterSpec <- function(shoulder=10)
{
   spec <- create.vrk2.candidateFilterSpec()

   rs13384219.loc <- 57907323
   left.loc <- rs13384219.loc - shoulder
   right.loc <- rs13384219.loc + shoulder
   spec$regions <- data.frame(chrom="chr2", start=left.loc, end=right.loc, stringsAsFactors=FALSE)
   spec$variants <- "rs13384219"

   spec

} # create.vrk2.rs13384219.variant.candidateFilterSpec
#----------------------------------------------------------------------------------------------------
test_basicConstructor <- function(reuse=FALSE)
{
   if(!interactive()) return(TRUE)

   if(!reuse)  printf("--- test_basicConstructor")

   candidateFilterSpec <- create.vrk2.candidateFilterSpec()

   hdf <- with(candidateFilterSpec,
               HumanDHSFilter(genomeName,
                              encodeTableName=encodeTableName,
                              pwmMatchPercentageThreshold=85L,
                              geneInfoDatabase.uri=geneInfoDB,
                              pfms = as.list(query(query(MotifDb, "sapiens"),"jaspar2016")),
                              regions=candidateFilterSpec$regions))

   checkTrue(all(c("HumanDHSFilter", "CandidateFilter") %in% is(hdf)))

      # make sure an unsupported genome triggers an error
   candidateFilterSpec$genomeName <- "intentional error in genome name"

   if(!reuse)
     checkException(hdf <- with(candidateFilterSpec,
                           HumanDHSFilter(genomeName,
                              encodeTableName=encodeTableName,
                              pwmMatchPercentageThreshold=85L,
                              geneInfoDatabase.uri=geneInfoDB,
                              pfms = as.list(query(query(MotifDb, "sapiens"),"jaspar2016")),
                              regions=regions)),
                    silent = TRUE)

   if(reuse)
      return(hdf)

} # test_basicConstructor
#----------------------------------------------------------------------------------------------------
# takes > 5 seconds, bioc check
test_getEncodeRegulatoryTableNames <- function()
{
   if(!interactive()) return(TRUE)

   printf("--- test_getEncodeRegulatoryTableNames")

   candidateFilterSpec <- create.vrk2.candidateFilterSpec()
   hdf <- with(candidateFilterSpec,
               HumanDHSFilter(genomeName,
                              encodeTableName=encodeTableName,
                              pwmMatchPercentageThreshold=85L,
                              geneInfoDatabase.uri=geneInfoDB,
                              pfms = as.list(query(query(MotifDb, "sapiens"),"jaspar2016")),
                              regions=regions))

    names <- getEncodeRegulatoryTableNames(hdf)

    checkTrue(length(names) > 90)   # 96 on (18 oct 2017)

} # test_getEncodeRegulatoryTableNames
#----------------------------------------------------------------------------------------------------
test_checkSampleOfEncodeTables <- function(quiet=TRUE)
{
   if(!interactive()) return(TRUE)

   printf("--- test_checkSampleOfEncodeTables")

   candidateFilterSpec <- create.vrk2.candidateFilterSpec()

   hdf <- with(candidateFilterSpec,
               HumanDHSFilter(genomeName,
                              encodeTableName=encodeTableName,
                              #fimoDatabase.uri=fimoDB,
                              pwmMatchPercentageThreshold=85L,
                              geneInfoDatabase.uri=geneInfoDB,
                              regions=regions,
                              pfms = as.list(query(query(MotifDb, "sapiens"),"jaspar2016")),
                              quiet=TRUE))

   tableNames <- getEncodeRegulatoryTableNames(hdf)

   #  rs13384219 at chr2:57907073-57907573
   chrom <- "chr2"
   rs13384219.loc <- 57907323
   start <- rs13384219.loc - 10000
   end <- rs13384219.loc + 10000

   selectedTableNames <- tableNames[sample(1:length(tableNames), size=10)]

   for(tableName in selectedTableNames){
      #printf("tableName: %s", tableName)
      tbl <-getRegulatoryRegions(hdf, tableName, chrom, start, end)
      if(!quiet) printf("--- %s: %d rows", tableName, nrow(tbl))
      checkTrue(nrow(tbl) >= 0)
      checkEquals(colnames(tbl), c("chrom", "chromStart", "chromEnd",  "count",  "score"))
      }

} # test_checkSampleOfEncodeTables
#----------------------------------------------------------------------------------------------------
# use this sample code to poke at the encode data offered by uscs
# note that most of the tables here only serve to list, not regions, but
# metadata:  what the inputs where, where the bb (bigBed) file can be found.
# for instance, the Hotspot table has this single line:
#                                                                     fileName
# 1 /gbdb/hg38/bbi/wgEncodeRegDnase/wgEncodeRegDnaseUwA549Hotspot.broadPeak.bb
explore.ucsc.database <- function()
{
   library(RMySQL)
   driver <- MySQL()
   host <- "genome-mysql.cse.ucsc.edu"
   user <- "genome"
   dbname <- "hg38"

   db <- dbConnect(driver, user = user, host = host, dbname = dbname)
   tables <- c("wgEncodeRegDnaseClustered", "wgEncodeRegDnaseUwA549Hotspot",  "wgEncodeRegDnaseUwA549Peak")
   main.clause <- sprintf("select * from %s where", tables[1]);

   chrom <- "chr5"
   start <- 88819630
   end   <- 88835936

   query <- paste(main.clause,
                  sprintf("chrom = '%s'", chrom),
                   sprintf("and chromStart >= %d", start),
                   sprintf("and chromEnd <= %d", end),
                   collapse = " ")
   suppressWarnings(dbGetQuery(db, sprintf("select * from %s limit 5", tables[3])))


} # explore.ucsc.database
#----------------------------------------------------------------------------------------------------
test_getRegulatoryRegions <- function()
{
   if(!interactive()) return(TRUE)

   printf("--- test_getRegulatoryRegions");

   hdf <- test_basicConstructor(reuse=TRUE)

   tableNames <- getEncodeRegulatoryTableNames(hdf)
   table <- "wgEncodeRegDnaseClustered"
   checkTrue(table %in% tableNames)


      # some DHS regions, good for testing small region overlap handling
      #    chrom chromStart chromEnd score sourceCount
      # 1   chr1   88801880 88802150   573          64
      # 2   chr1   88802480 88802930   287          13
      # 3   chr1   88803000 88803270   541          60
      # 4   chr1   88811140 88811290   100           1
      # 5   chr1   88811400 88811550    68           1

     # a small region, entirely within a DHS region
   chrom = "chr1"
   start <-  88802520
   end   <-  88802530
   tbl.regions <- getRegulatoryRegions(hdf, table, chrom, start, end)
   checkEquals(nrow(tbl.regions), 1)
   checkEquals(tbl.regions$chromStart, start)
   checkEquals(tbl.regions$chromEnd, end)

     # a small region, overhanging that DHS region at its upperbound, on the right
   chrom = "chr1"
   start <-  88802145
   end   <-  88802155
   tbl.regions <- getRegulatoryRegions(hdf, table, chrom, start, end)
   checkEquals(nrow(tbl.regions), 1)
   checkEquals(tbl.regions$chromStart, start)
   checkEquals(tbl.regions$chromEnd, 88802150)

     # a small region, overhanging that DHS region at its lowerbound, on the left
   chrom = "chr1"
   start <-  88801878
   end   <-  88801883
   tbl.regions <- getRegulatoryRegions(hdf, table, chrom, start, end)
   checkEquals(nrow(tbl.regions), 1)
   checkEquals(tbl.regions$chromStart, 88801880)
   checkEquals(tbl.regions$chromEnd, end)

      # another completely contained-in-DHS-region, small area of interest
   chrom <- "chr1"
   start <- 167830160
   end   <- 167830180
   tbl.regions <- getRegulatoryRegions(hdf, table, chrom, start, end)
   checkEquals(nrow(tbl.regions), 1)
   checkEquals(tbl.regions$chromStart, start)
   checkEquals(tbl.regions$chromEnd, end)

   # the regions around the AQP4 regulatory snps

    loc <- c(26864410, 26865469, 26855623, 26855854, 26850565)
    rsids <- c("rs3763040", "rs3875089", "rs335929", "rs3763043", "rs9951307")

    tbl.snps <- data.frame(chrom=rep("chr18", 5), start=loc-1, end=loc+1, name=rsids, stringsAsFactors=FALSE)
    tbl.regions <- getRegulatoryRegions(hdf, table, "chr18",  min(loc), max(loc))

      # with these hand-picked locations we know to expect an equal or larger number of
      # open chromatin regions in the aggregating "wgEncodeRegDnaseClustered" table
    checkTrue(nrow(tbl.regions) >= length(loc))
      # all the snp locs should fall within these regions
    checkTrue(all(loc >= tbl.regions$start))
    checkTrue(all(loc <= tbl.regions$end))

} # test_getRegulatoryRegions
#----------------------------------------------------------------------------------------------------
test_getCandidates.emptyRegion <- function()
{
   if(!interactive()) return(TRUE)

    printf("--- test_getCandidates.emptyRegion")

    db.address <- system.file(package="trena", "extdata")
    genome.db.uri <- paste("sqlite:/", db.address, "vrk2.neighborhood.hg38.gtfAnnotation.db",  sep = "/")

    hdf <-  HumanDHSFilter("hg38",
                           encodeTableName="wgEncodeRegDnaseClustered",
                           pwmMatchPercentageThreshold=80L,
                           geneInfoDatabase.uri=genome.db.uri,
                           regions=data.frame(chrom="chr18", start=26850560, end=26850565, stringsAsFactors=FALSE),
                           pfms = as.list(query(query(MotifDb, "sapiens"),"jaspar2016")),
                           quiet=TRUE)
   tbl <- getCandidates(hdf)
   checkEquals(nrow(tbl), 0)

} # test_getCandidates.emptyRegion
#----------------------------------------------------------------------------------------------------
test_getCandidates.vrk2.twoRegions <- function()
{
   if(!interactive()) return(TRUE)

   printf("--- test_getCandidates.vrk2.twoRegions")

   cfSpec <- create.vrk2.candidateFilterSpec.twoRegions()
   hdf <- with(cfSpec, HumanDHSFilter(genomeName,
                                      encodeTableName=encodeTableName,
                                      pwmMatchPercentageThreshold=97L,
                                      geneInfoDatabase.uri=geneInfoDB,
                                      region=regions,
                                      pfms = as.list(query(query(MotifDb, "sapiens"),"jaspar2016")),
                                      quiet=TRUE))

   tbl <- getCandidates(hdf)
   checkEquals(colnames(tbl),
               c("motifName", "chrom", "motifStart", "motifEnd", "strand", "motifScore", "motifRelativeScore", "match",
                 "regulatoryRegionStart", "regualtoryRegionEnd", "regulatorySequence", "variant", "shortMotif"))

    # make sure all motifs fall within the specified restions
    # cfSpec$regionsSpec: [1] "chr2:57906700-57906870" "chr2:57907740-57908150"
   starts <- tbl$motifStart
   ends   <- tbl$motifEnd
   starts.in.region.1 <- intersect(which(starts >= 57906700), which(starts <= 57906870))
   starts.in.region.2 <- intersect(which(starts >= 57907740), which(starts <= 57908150))
     # did we find them all?
   checkEquals(nrow(tbl), 11)
   checkTrue(all(sort(c(starts.in.region.1, starts.in.region.2)) == seq_len(nrow(tbl))))
   checkTrue(all(ends[starts.in.region.1] <= 57906870))
   checkTrue(all(ends[starts.in.region.2] <= 57908150))

   expected.motifs <- c("Hsapiens-jaspar2016-GATA3-MA0037.2",  "Hsapiens-jaspar2016-FOXL1-MA0033.2",
                        "Hsapiens-jaspar2016-GATA5-MA0766.1",  "Hsapiens-jaspar2016-RHOXF1-MA0719.1",
                        "Hsapiens-jaspar2016-RHOXF1-MA0719.1", "Hsapiens-jaspar2016-SPI1-MA0080.1",
                        "Hsapiens-jaspar2016-SPI1-MA0080.1",   "Hsapiens-jaspar2016-ETS1-MA0098.1",
                        "Hsapiens-jaspar2016-ETS1-MA0098.1",   "Hsapiens-jaspar2016-GATA2-MA0036.1",
                        "Hsapiens-jaspar2016-GATA2-MA0036.1")

   checkTrue(all(tbl$motifName %in% expected.motifs))

   tbl.conservative <- associateTranscriptionFactors(MotifDb, tbl, source="MotifDb", expand.rows=TRUE)
   motifDb.associated.genes <- c("GATA3","FOXL1","GATA5","RHOXF1","RHOXF1","SPI1","SPI1","ETS1","ETS1","GATA2", "GATA2")
   checkEquals(sort(motifDb.associated.genes), sort(tbl.conservative$geneSymbol))


      # we used MotifDb as our source of motifs, wherein motifName is, for example,
      # Hsapiens-jaspar2016-GATA2-MA0036.1.
      # if we use TFClass mapping from motifs to transcription factor gene names, we must create
      # a new column "shortMotif" so that the motif has a form we can use as a TFclass key
   tbl$shortMotif <- unlist(lapply(strsplit(tbl$motifName, split="-"), "[", 4))
   tbl.liberal      <- associateTranscriptionFactors(MotifDb, tbl, source="TFClass", expand.rows=TRUE)
   tfClass.associated.genes <- unique(tbl.liberal$geneSymbol)
   checkEquals(length(tfClass.associated.genes), 110)

      # despite the high number of genes associated to motifs in TFClass, not all MotifDb associations
      # are found
   checkEquals(setdiff(motifDb.associated.genes, tfClass.associated.genes), c("SPI1", "ETS1"))

} # test_getCandidates.vrk2.twoRegions
#----------------------------------------------------------------------------------------------------
test_getCandidates.vrk2.rs13384219.variant <- function()
{
   if(!interactive()) return(TRUE)

   printf("--- test_getCandidates.vrk2.rs13384219.variant")
   cfSpec <- create.vrk2.rs13384219.variant.candidateFilterSpec()

   hdf.wt <- with(cfSpec, HumanDHSFilter(genomeName,
                                         encodeTableName=encodeTableName,
                                         pwmMatchPercentageThreshold=85L,
                                         geneInfoDatabase.uri=geneInfoDB,
                                         regions=regions,
                                         variants=NA_character_,
                                         pfms = as.list(query(query(MotifDb, "sapiens"),"jaspar2016")),
                                         quiet=TRUE))

   hdf.var <- with(cfSpec, HumanDHSFilter(genomeName,
                                          encodeTableName=encodeTableName,
                                          pwmMatchPercentageThreshold=85L,
                                          geneInfoDatabase.uri=geneInfoDB,
                                          regions=regions,
                                          variants=variants,
                                          pfms = as.list(query(query(MotifDb, "sapiens"),"jaspar2016")),
                                          quiet=TRUE))

   tbl.wt <- getCandidates(hdf.wt)
   tbl.var <- getCandidates(hdf.var)

   checkEquals(dim(tbl.wt),  c(53, 13))
   checkEquals(dim(tbl.var), c(45, 13))

      # many motifs survive, 9 are lost, 2 are gained
      # see trena::assessSnp for close comparison of losses and gains, wherein differing scores are reported

   checkEquals(length(intersect(tbl.wt$motifName, tbl.var$motifName)), 36)
   checkEquals(length(setdiff(tbl.wt$motifName, tbl.var$motifName)),    9)
   checkEquals(length(setdiff(tbl.var$motifName, tbl.wt$motifName)),    2)

} # test_getCandidates.vrk2.rs13384219.variant
#------------------------------------------------------------------------------------------------------------------------
# demonstrate an initially unanticipated use of this class: just get the open chromatin regions from ucsc.  a hack...
test_getOpenChromatinFastAndSimple <- function()
{
   printf("--- test_getOpenChromatinFastAndSimple")

   hdf <- HumanDHSFilter("hg38", "wgEncodeRegDnaseClustered", pwmMatchPercentageThreshold=0,
                         geneInfoDatabase.uri="bogus", regions=data.frame(), pfms=list())
   chrom <- "chr14"
   start <- 91654248
   end <-   93010778
   tbl.dhs <- getRegulatoryRegions(hdf, "wgEncodeRegDnaseClustered", chrom, start, end)

   checkEquals(colnames(tbl.dhs), c("chrom", "chromStart", "chromEnd", "count", "score"))
   checkTrue(nrow(tbl.dhs) > 1200)
   checkTrue(all(tbl.dhs$chrom == chrom))
   checkTrue(all(tbl.dhs$start >= start))
   checkTrue(all(tbl.dhs$end <= end))

} # test_getOpenChromatinFastAndSimple
#------------------------------------------------------------------------------------------------------------------------
if(!interactive())
   runTests()
Any scripts or data that you put into this service are public.
trena documentation built on Nov. 15, 2020, 2:07 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
trena
Fit transcriptional regulatory networks using gene expression, priors, machine learning

inst/unitTests/test_HumanDHSFilter.R
In trena: Fit transcriptional regulatory networks using gene expression, priors, machine learning

Try the trena package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

trena Fit transcriptional regulatory networks using gene expression, priors, machine learning

inst/unitTests/test_HumanDHSFilter.R In trena: Fit transcriptional regulatory networks using gene expression, priors, machine learning

Try the trena package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

trena
Fit transcriptional regulatory networks using gene expression, priors, machine learning

inst/unitTests/test_HumanDHSFilter.R
In trena: Fit transcriptional regulatory networks using gene expression, priors, machine learning