inst/unitTests/test_MotifDb.R

library(MotifDb)
library(RUnit)
library(seqLogo)
#----------------------------------------------------------------------------------------------------
printf <- function(...) print(noquote(sprintf(...)))
#----------------------------------------------------------------------------------------------------
runTests = function()
{
  test.emptyCtor()
  test.nonEmptyCtor()
  test.MotifDb.normalMode()
  test.MotifDb.emptyMode()
  test.allMatricesAreNormalized()
  test.providerNames()
  test.geneSymbols()
  test.geneIdsAndTypes()
  test.proteinIds()
  test.sequenceCount()
  test.longNames()
  test.organisms()
  test.bindingDomains()
  test.experimentTypes()
  test.tfFamilies()
  test.bindingSequences()
  test.flyBindingDomains()
  test.pubmedIDs()
  #test.allFullNames()
  test.subset()
  test.subsetWithVariables()
  test.queryOldStyle()
  test.query()
  test.transformMatrixToMemeRepresentation()
  test.matrixToMemeText()
  test.export_memeFormatStdOut()
  test.export_memeFormatToFile()
  test.export_memeFormatToFileDuplication()
  test.export_memeFormatToFile_run_tomtom()
  test.flyFactorGeneSymbols()
  test.export_jasparFormatStdOut()
  test.export_jasparFormatToFile()

  test.geneToMotif()
  test.geneToMotif.ignore.jasparSuffixes()
  test.geneToMotif.oneGene.noMotifs
  test.motifToGene()
  test.associateTranscriptionFactors()

  test.hocomoco11.with.reliabilityScores()

} # runTests
#------------------------------------------------------------------------------------------------------------------------
test.emptyCtor = function()
{
  print('--- test.emptyCtor')
  motif.list = MotifDb:::MotifList()
  checkEquals(length(motif.list), 0)

} # test.emptyCtor
#------------------------------------------------------------------------------------------------------------------------
test.nonEmptyCtor = function()
{
  print('--- test.nonEmptyCtor')
  mtx = matrix(runif(20), nrow=4, ncol=5, byrow=T, dimnames=list(c('A', 'C', 'G', 'T'), as.character(1:5)))
  mtx.normalized = apply(mtx, 2, function(colvector) colvector / sum(colvector))
  matrixList = list(mtx.normalized)

  tbl.md = data.frame(providerName='',
                       providerId='',
                       dataSource='',
                       geneSymbol='',
                       geneId='',
                       geneIdType='',
                       proteinId='',
                       proteinIdType='',
                       organism='',
                       sequenceCount='',
                       bindingSequence='',
                       bindingDomain='',
                       tfFamily='',
                       experimentType='',
                       pubmedID='',
                       stringsAsFactors=FALSE)
  names(matrixList) = 'test'
  rownames(tbl.md) = 'test'
  motif.list = MotifDb:::MotifList(matrixList, tbl.md)
  checkEquals(length(motif.list), 1)

} # test.nonEmptyCtor
#------------------------------------------------------------------------------------------------------------------------
# 'normal' in that all included data sources are already loaded
test.MotifDb.normalMode = function()
{
  print('--- test.MotifDb.normalMode')

  mdb = MotifDb # (quiet=TRUE)
    #(5 jun 2012)
    # JASPAR_CORE: 459
    # ScerTF: 196
    # UniPROBE: 380
    # FlyFactorSurvey: 614
    # hPDI: 437
  checkTrue(length(mdb) > 2080)

} # test.MotifDb.normalMode
#------------------------------------------------------------------------------------------------------------------------
# this mode is not intended for users, but may see use in the future.
test.MotifDb.emptyMode = function()
{
  print('--- test.MotifDb.emptyMode')

  mdb = MotifDb:::.MotifDb(loadAllSources=FALSE, quiet=TRUE)
  checkTrue(length(mdb) == 0)

} # test.MotifDb.emptyMode
#------------------------------------------------------------------------------------------------------------------------
# 3 jaspar matrices arrive with organism = NA.  find and fix.  make sure here.
# NA-JASPAR_CORE-TBP-MA0108.2: JASPAR gives <NA> for speciesID
# NA-JASPAR_CORE-HNF4A-MA0114.1: JASPAR gives <NA> for speciesID
# NA-JASPAR_CORE-CEBPA-MA0102.2: JASPAR gives '-' for speciesID, website says 'vertebrates'

# Many more NA's exist...need to fix these; here's a quick fix for now

test.noNAorganisms = function()

{
  print('--- test.noNAorganisms')
  #checkEquals(which(is.na(mcols(MotifDb)$organism)), integer(0))

  # There's a fair number of NA organisms, mostly due to including the homer DB
  checkEquals(sum(is.na(mcols(MotifDb)$organism)), 366)

} # test.noNAorganisms
#------------------------------------------------------------------------------------------------------------------------
test.allMatricesAreNormalized = function()
{
  print('--- test.allMatricesAreNormalized')
  mdb = MotifDb#(quiet=TRUE)
  matrices = mdb@listData
    # a lenient test required by "Cparvum-UniPROBE-Cgd2_3490.UP00395" and  "Hsapiens-UniPROBE-Sox4.UP00401"
    # for reasons not yet explored.  10e-8 should be be possible
  checkTrue(all(sapply(matrices, function(m) all(abs(colSums(m) - 1.0) < 0.02))))

} # test.allMatricesAreNormalized
#------------------------------------------------------------------------------------------------------------------------
test.providerNames = function()
{
  print('--- test.getProviderNames')
  mdb = MotifDb #()
  pn = mcols(mdb)$providerName
  checkEquals(length(which(is.na(pn))), 0)
  checkEquals(length(which(pn == '')), 0)

} # test.providerNames
#------------------------------------------------------------------------------------------------------------------------
test.geneSymbols = function()
{
  print('--- test.getGeneSymbols')
  mdb = MotifDb #()
  syms = mcols(mdb)$geneSymbol
  checkEquals(length(which(is.na(syms))), 683)  # no symols yet for the dgf stamlab motifs
  checkEquals(length(which(syms == '')), 0)

} # test.geneSymbols
#------------------------------------------------------------------------------------------------------------------------
test.geneIdsAndTypes = function()
{
  print('--- test.getGeneIdsAndTypes')
  mdb = MotifDb
  tbl <- mcols(mdb)
  geneIds = tbl$geneId
  geneIdTypes = tbl$geneIdType
  typeCounts = as.list(table(geneIdTypes))

  checkTrue(typeCounts$ENTREZ > 2300)
  checkTrue(typeCounts$FLYBASE >= 45)
  checkTrue(typeCounts$SGD >= 600)
  checkTrue(nrow(subset(tbl, is.na(geneIdType))) > 2000)

  empty.count = length(which(geneIds == ''))
  checkEquals(empty.count, 0)


} # test.geneIdsAndTypes
#------------------------------------------------------------------------------------------------------------------------
# make sure that all proteinIds have explicit values, either proper identifiers or NA
# currently tested by looking for empty string assignments
test.proteinIds = function()
{
  print('--- test.proteinIds')
  mdb = MotifDb #(quiet=TRUE)
  NA.string.count <- sum(is.na(mcols(mdb)$proteinId))
#  NA.string.count = length(grep('NA', mcols(mdb)$proteinId))

  checkEquals(NA.string.count, 2514)
  # FIX THIS; Currently 2514 don't have protein IDs
  #checkEquals(NA.string.count, 0)

  empty.count = length(which(mcols(mdb)$proteinId==""))
  if(empty.count > 0)
    browser('test.proteinIds')

  checkEquals(empty.count, 0)

     # FlyFactorSurvey, as digested by me, had a blanket assigment of UNIPROT to all proteinIds
     # Herve' pointed out that this applied also to entries with no proteinId.
     # make sure this is fixed

  ### FIX THIS TOO! Currently have 913 entries with a proteinIdType and no proteinId
  x = mcols(mdb)
  # checkEquals(nrow(subset(x, !is.na(proteinIdType) & is.na(proteinId))), 0)


} # test.proteinIds
#------------------------------------------------------------------------------------------------------------------------
# only for UniPROBE do we not have sequence count.  might be possible to get them along with 'insertion sequences'
test.sequenceCount = function()
{
  print('--- test.sequenceCount')
  mdb = MotifDb #()
  x = mcols(mdb)
  if(interactive()) {
    x.up = subset(x, dataSource == 'UniPROBE')
    checkTrue(all(is.na(x.up$sequenceCount)))
    }
  else {
    uniprobe.indices = which(x$dataSource == 'UniPROBE')
    checkTrue(all(is.na(x$sequenceCount [uniprobe.indices])))
    }

} # test.sequenceCount
#------------------------------------------------------------------------------------------------------------------------
# make sure that a legitimate organism-dataSource-identifier is supplied for each matrix and as a rowname
# of the corresponding DataFrame
test.longNames = function()
{
  print('--- test.longNames')
  mdb <- MotifDb
  longNames <- strsplit(names(mdb), '-')
  organisms <- unique(sapply(longNames, '[', 1))

  dataSources <- unique(lapply(longNames, '[', 2))

  recognized.dataSources <- c("cisbp_1.02", "FlyFactorSurvey",
                              "HOCOMOCOv10", "HOCOMOCOv11B-core", "HOCOMOCOv11C-core", "HOCOMOCOv11B-full",
                              "HOCOMOCOv11C-full", "HOCOMOCOv11A-core", "HOCOMOCOv11D-full",
                              "HOCOMOCOv11A-full", "HOMER", "hPDI",
                              "JASPAR_CORE", "JASPAR_2014", "jaspar2016", "jaspar2018",
                              "jolma2013", "ScerTF", "stamlab", "SwissRegulon", "UniPROBE")

  recognized.dataSources <-  c("cisbp_1.02",
                             "FlyFactorSurvey",
                             "HOCOMOCOv10",
                             "HOCOMOCOv11-core-A",
                             "HOCOMOCOv11-core-B",
                             "HOCOMOCOv11-core-C",
                             "HOCOMOCOv11-full-A",
                             "HOCOMOCOv11-full-B",
                             "HOCOMOCOv11-full-C",
                             "HOCOMOCOv11-full-D",
                             "HOMER",
                             "hPDI",
                             "JASPAR_2014",
                             "JASPAR_CORE",
                             "jaspar2016",
                             "jaspar2018",
                             "jolma2013",
                             "ScerTF",
                             "stamlab",
                             "SwissRegulon",
                             "UniPROBE")

  recognized.organisms <- unique(mcols(mdb)$organism)
    # a few(3) matrices from JASPAR core have NA organism.  make this into a character
    # so that it can be matched up against the 'NA' extracted from longNames just above
  na.indices <- which(is.na(recognized.organisms))
  if(length(na.indices) > 0)
     recognized.organisms [na.indices] <- 'NA'

  checkTrue(all(organisms %in% recognized.organisms))
  # new hocomoco-[core|full]-[ABCD] dataSource names are not incorporated into rownames yet
  # checkTrue(all(dataSources %in% recognized.dataSources))

  } # test.longNames
#------------------------------------------------------------------------------------------------------------------------
# make sure that a legitimate organism is specified for each matrix
test.organisms <- function()
{
  print('--- test.organisms')
  mdb <- MotifDb #(quiet=TRUE)
  organisms <- mcols(mdb)$organism

     # jaspar_core has 3 NA speciesId: TBP, HNF4A and CEBPA(MA0108.2, MA0114.1, MA0102.2)
     # their website shows these as vertebrates, which I map to 'Vertebrata'.  An organismID of '-'
  # gets the same treatment, matching website also.

  ### Note: this failing test is the same as the test.noNAorganisms test!
  # As in case of noNA, need to add organisms for these
  #checkEquals(which(is.na(mcols(MotifDb)$organism)), integer(0))

  empty.count <- length(which(mcols(mdb)$organism==""))
  checkEquals(empty.count, 0)

} # test.organisms
#------------------------------------------------------------------------------------------------------------------------
test.bindingDomains <- function()
{
  print('--- test.bindingDomains')
  mdb <- MotifDb #(quiet=TRUE)
  checkTrue(length(unique(mcols(mdb)$bindingDomain)) > 1)

} # test.bindingDomains
#------------------------------------------------------------------------------------------------------------------------
test.flyBindingDomains <- function()
{
  print('--- test.flyBindingDomains')

  x <- mcols(MotifDb)
  tmp <- as.list(head(sort(table(subset(x, organism=='Dmelanogaster')$bindingDomain), decreasing=TRUE), n=3))

    # these counts will likely change with a fresh load of data from FlyFactorSurvey.

  checkEquals(tmp$Homeobox, 212)
  checkEquals(tmp[['zf-C2H2']], 160)
  checkEquals(tmp[["Helix-Turn-Helix"]], 182)
  checkTrue(length(which(is.na(subset(x, organism=='Dmelanogaster')$bindingDomain))) > 300) # lots of cisbp

} # test.flyBindingDomains
#------------------------------------------------------------------------------------------------------------------------
test.experimentTypes <- function()
{
  print('--- test.experimentTypes')
  mdb <- MotifDb #(quiet=TRUE)
  x <- mcols(mdb)
  checkTrue(length(unique(x$experimentType)) >= 18)
  checkEquals(length(which(x$experimentType=='')), 0)

} # test.experimentTypes
#------------------------------------------------------------------------------------------------------------------------
test.tfFamilies = function()
{
  print('--- test.tfFamilies')
  mdb = MotifDb #(quiet=TRUE)
  checkTrue(length(unique(mcols(mdb)$tfFamily)) > 1)

} # test.tfFamilies
#------------------------------------------------------------------------------------------------------------------------
test.bindingSequences = function()
{
  print('--- test.bindingSequences')
  mdb = MotifDb #(quiet=TRUE)
  checkTrue(length(unique(mcols(mdb)$bindingSequence)) > 1)

} # test.tfFamilies
#------------------------------------------------------------------------------------------------------------------------
test.pubmedIDs = function()
{
  print('--- test.pubmedIDs')
  x = mcols(MotifDb) #(quiet=TRUE))
  checkTrue(length(unique(x$pubmedID)) >= 139)
  checkEquals(length(which(x$pubmedID == '')), 0)

} # test.pubmedIDs
#------------------------------------------------------------------------------------------------------------------------
# every matrix, and every corresponding metadata table row, has a name formed thus:
#
#    dataSource-organism-nativeGeneName
#
#  where nativeGeneName is how the matrix is named in the dataSource from which it comes.
#  thus:
#          FlyFactorSurvey-Dmelanogaster-Abd.A_FlyReg_FBgn0000014
#          UniPROBE-Scerevisiae-Asg1-UP00350
#          ScerTF-Scerevisiae-ABF2-badis
#          JASPAR_CORE-Rrattus-Ar-MA0007.1
#
skip.test.allFullNames = function()
{
  print('--- test.allFullNames')
  mdb = MotifDb #(quiet=TRUE)
  matrices = mdb@listData
  fullNames = names(matrices)

  all.dataSources = unique(mcols(mdb)$dataSource)
  checkTrue(length(all.dataSources) >= 4)

  for(source in all.dataSources) {
     this.dataSource <- source
     matrices.by.source = subset(mdb, dataSource==this.dataSource)
     matrix.name = names(matrices.by.source)[1]
        #  FlyFactorSurvey: Dmelanogaster-FlyFactorSurvey-ab_SANGER_10_FBgn0259750
        #             hPDI: Hsapiens-hPDI-ABCF2
        #      JASPAR_CORE: JASPAR_CORE-Athaliana-AGL3-MA0001.1
        #         UniPROBE: Hsapiens-UniPROBE-Sox4.UP00401
     checkTrue(grep(this.dataSource, matrix.name) == 1)
     #printf('%20s: %s', source, matrix.name)
     }

  return(TRUE)

} # test.allFullNames
#------------------------------------------------------------------------------------------------------------------------
test.subset = function()
{
  if(interactive()) {
    print('--- test.subset')
    mdb = MotifDb
    checkTrue('geneSymbol' %in% colnames(elementMetadata(mdb)))
    mdb.sub = subset(mdb, geneSymbol=='ABCF2')
    checkEquals(length(mdb.sub), 1)
    } # if interactive

} # test.subset
#------------------------------------------------------------------------------------------------------------------------
test.subsetWithVariables = function()
{
  if(interactive()) {
    print('--- test.subsetWithVariables')

    mdb = MotifDb #()
    checkTrue('geneSymbol' %in% colnames(elementMetadata(mdb)))
    target.gene <<- 'ABCF2'
    mdb.sub = subset(mdb, geneSymbol==target.gene)
    checkEquals(length(mdb.sub), 1)
    } # if interactive

} # test.subsetWithVariables
#------------------------------------------------------------------------------------------------------------------------
# "old style": just one query term allowed
test.queryOldStyle = function()
{
  print('--- test.queryOldStyle')
  mdb = MotifDb

    # do queries on dataSource counts match those from a contingency table?
  sources.list = as.list(table(mcols(mdb)$dataSource))
  checkEquals(length(query(mdb, 'flyfactorsurvey')), sources.list$FlyFactorSurvey)
  checkEquals(length(query(mdb, 'uniprobe')), sources.list$UniPROBE)
  checkEquals(length(query(mdb, 'UniPROBE')), sources.list$UniPROBE)

    # gene symbols which begin with 'sox' are quite common.  can we them?
    # there are currently(19 jul 2012) 18, but since this may change, our test is approximate

  # Change on 8/1/2017: increase top limit of sox entries as they've expanded
  sox.entries = query(mdb, '^sox')
  checkTrue(length(sox.entries) > 10)
  checkTrue(length(sox.entries) < 200)

    # manual inspection reveals that some of these genes have names which are all capitalized.  test that.
  checkTrue(length(query(mdb, '^sox', ignore.case=TRUE)) > length(query(mdb, '^SOX', ignore.case=FALSE)))

    # make sure that queries can be stacked up, and that order of call does not affect the outcome
   uniprobe.sox.matrices = query(query(mdb, 'uniprobe'), '^sox')
   sox.uniprobe.matrices = query(query(mdb, '^sox'), 'uniprobe')

   checkEquals(length(uniprobe.sox.matrices), length(sox.uniprobe.matrices))

     # an approximate count check
  checkTrue(length(uniprobe.sox.matrices) > 10)
  checkTrue(length(uniprobe.sox.matrices) < 30)

   checkEquals(unique(mcols(uniprobe.sox.matrices)$dataSource), 'UniPROBE')
   checkEquals(unique(mcols(sox.uniprobe.matrices)$dataSource), 'UniPROBE')
   gene.symbols = sort(unique(mcols(uniprobe.sox.matrices)$geneSymbol))

    # query on a string(in this case, an oddly named motif) which contains
    # regular expression characters.  no solution to this yet.
    # query uses base R's grep, in which
    #  x <- query(mdb, "ELK1,4_GABP{A,B1}.p3")

} # test.queryOldStyle
#------------------------------------------------------------------------------------------------------------------------
test.query <- function()
{
  print('--- test.query')
  mdb = MotifDb

  ors <- c("MA0511.1", "MA0057.1")
  ands <- c("jaspar2018", "sapiens")
  nots <- "cisbp"

  x <- query(mdb, andStrings=ands, orStrings=ors)
  checkEquals(length(x), 2)
  checkEquals(sort(names(x)),
             c("Hsapiens-jaspar2018-MZF1(var.2)-MA0057.1", "Hsapiens-jaspar2018-RUNX2-MA0511.1"))

  x <- query(mdb, andStrings="MA0057.1")
  checkEquals(length(x), 15)

  x <- query(mdb, andStrings=c("MA0057.1", "cisbp"))
  checkEquals(length(x), 11)

  x <- query(mdb, andStrings=c("MA0057.1"), notStrings="cisbp")
  checkEquals(length(x), 4)

  x <- query(mdb, andStrings=c("MA0057.1"), notStrings=c("cisbp", "JASPAR_2014"))
  checkEquals(length(x), 3)

  x <- query(mdb, orStrings=c("mus", "sapiens"), andStrings="MA0057.1")
  #checkEquals(sort(names(x)),

    # do queries on dataSource counts match those from a contingency table?
  sources.list = as.list(table(mcols(mdb)$dataSource))
  checkEquals(length(query(mdb, 'flyfactorsurvey')), sources.list$FlyFactorSurvey)
  checkEquals(length(query(mdb, 'uniprobe')), sources.list$UniPROBE)
  checkEquals(length(query(mdb, 'UniPROBE')), sources.list$UniPROBE)

} # test.query
#------------------------------------------------------------------------------------------------------------------------
test.query2 <- function()
{
  mdb <- MotifDb
  matrices.human <- query(mdb, 'hsapiens')
  matrices.sox4 <- query(mdb, 'sox4')

  matrices.human.sox4 <- query(mdb, c("hsapiens", "sox4"))
  matrices.human.sox4.oldStyle <- query(matrices.human, "sox4")
  checkTrue(length(matrices.human.sox4) > 0)   # 6 found on(24 oct 2018)
  checkEquals(length(matrices.human.sox4), length(matrices.human.sox4.oldStyle))

  checkTrue(length(matrices.human.sox4) < length(matrices.human))
  checkTrue(length(matrices.human.sox4) < length(matrices.sox4))

  uniprobe.sox.matrices <- query(mdb, c('uniprobe', '^sox'))

} # test.query2
#------------------------------------------------------------------------------------------------------------------------
test.transformMatrixToMemeRepresentation = function()
{
  print('--- test.transformMatrixToMemeRepresentation')
  mdb = MotifDb #()
  matrix.agl3 = subset(mdb, dataSource=='JASPAR_CORE' & organism=='Athaliana' & geneSymbol=='AGL3')[[1]]
  checkEquals(dim(matrix.agl3), c(4, 10))

    # a transposed frequency is needed for meme.  we store normalized frequency matrices, to just transposition is needed
  tfm = MotifDb:::transformMatrixToMemeRepresentation(matrix.agl3)
  checkEquals(dim(tfm), c(10, 4))
  checkTrue(all(round(rowSums(tfm)) == 1.0))

} # test.transformMatrixToMemeRepresentation
#------------------------------------------------------------------------------------------------------------------------
# translate a motif matrix from MotifList into text suitable for meme and tomtom input.
# choose the top two from UniPROBE.  they reliably have high counts, and easily distinguished normalized frequencies
test.matrixToMemeText = function()
{
  print('--- test.matrixToMemeText')
  sox4 = subset(MotifDb, dataSource=='UniPROBE' & organism=='Hsapiens' & geneSymbol=='Sox4')
  rtg3 = subset(MotifDb, dataSource=='UniPROBE' & organism=='Scerevisiae' & geneSymbol=='Rtg3')

  text.sox4 = MotifDb:::matrixToMemeText(sox4)
     # check these with t(sox4 [[1]])
  line1.sox4  = " 0.2457457130  0.1950426500  0.2287887620  0.3304228750"
  line14.sox4 = " 0.2821643030  0.2286132160  0.1585395830  0.3306828990"

  checkEquals(length(text.sox4), 29)
  checkEquals(text.sox4 [1], "MEME version 4")
  checkEquals(text.sox4 [10], "MOTIF Hsapiens-UniPROBE-Sox4.UP00401")
  checkEquals(grep(line1.sox4, text.sox4), 12)
  checkEquals(grep(line14.sox4, text.sox4), 25)

  text.rtg3 = MotifDb:::matrixToMemeText(rtg3)
  line1.rtg3  = " 0.3935122858  0.1453016447  0.3308830322  0.1303030373"
  line20.rtg3 = " 0.2490417648  0.3966478493  0.1083586569  0.2459517291"
  checkEquals(length(text.rtg3), 35)         # 4 trailing empty lines
  checkEquals(text.rtg3 [1], "MEME version 4")
  checkEquals(text.rtg3 [10], "MOTIF Scerevisiae-UniPROBE-Rtg3.UP00356")
  checkEquals(grep(line1.rtg3, text.rtg3), 12)
  checkEquals(grep(line20.rtg3, text.rtg3), 31)

    # now call with both matrices, and see if the right results are returned
  text.both = MotifDb:::matrixToMemeText(c(sox4, rtg3))
  checkEquals(text.both [1], "MEME version 4")
  checkEquals(grep(line1.sox4, text.both), 12)
  checkEquals(grep(line14.sox4, text.both), 25)
  checkEquals(grep(line1.rtg3, text.both),  29)
  checkEquals(grep(line20.rtg3, text.both), 48)

} # test.matrixToMemeText
#------------------------------------------------------------------------------------------------------------------------
# translate a motif matrix from MotifList into text suitable for meme and tomtom input.
# choose the top two from UniPROBE.  they reliably have high counts, and easily distinguished normalized frequencies
#test.matrixToMemeText_mapVersion = function()
#{
#  print('--- test.matrixToMemeText_mapVersion')
#  sox4 = subset(MotifDb, dataSource=='UniPROBE' & organism=='Hsapiens' & geneSymbol=='Sox4')
#  rtg3 = subset(MotifDb, dataSource=='UniPROBE' & organism=='Scerevisiae' & geneSymbol=='Rtg3')
#
#  text.sox4 = MotifDb:::mapped.broken.matrixToMemeText(sox4)
#  text.rtg3 = MotifDb:::mapped.broken.matrixToMemeText(rtg3)
#  text.both = MotifDb:::mapped.broken.matrixToMemeText(c(sox4, rtg3))
#
#     # check these with t(sox4 [[1]])
#  line1.sox4  = " 0.2457457130  0.1950426500  0.2287887620  0.3304228750"
#  line14.sox4 = " 0.2821643030  0.2286132160  0.1585395830  0.3306828990"
#
#  checkEquals(length(text.sox4), 29)
#  checkEquals(text.sox4 [1], "MEME version 4")
#  checkEquals(text.sox4 [10], "MOTIF Hsapiens-UniPROBE-Sox4.UP00401")
#  checkEquals(grep(line1.sox4, text.sox4), 12)
#  checkEquals(grep(line14.sox4, text.sox4), 25)
#
#  line1.rtg3  = " 0.3935122858  0.1453016447  0.3308830322  0.1303030373"
#  line20.rtg3 = " 0.2490417648  0.3966478493  0.1083586569  0.2459517291"
#  checkEquals(length(text.rtg3), 35)         # 4 trailing empty lines
#  checkEquals(text.rtg3 [1], "MEME version 4")
#  checkEquals(text.rtg3 [10], "MOTIF Scerevisiae-UniPROBE-Rtg3.UP00356")
#  checkEquals(grep(line1.rtg3, text.rtg3), 12)
#  checkEquals(grep(line20.rtg3, text.rtg3), 31)
#
#    # now call with both matrices, and see if the right results are returned
#  checkEquals(text.both [1], "MEME version 4")
#  checkEquals(grep(line1.sox4, text.both), 12)
#  checkEquals(grep(line14.sox4, text.both), 25)
#  checkEquals(grep(line1.rtg3, text.both),  29)
#  checkEquals(grep(line20.rtg3, text.both), 48)
#
#} # test.matrixToMemeText_mapVersion
#------------------------------------------------------------------------------------------------------------------------
test.export_memeFormatStdOut = function()
{
  print('--- test.export_memeFormatStdOut')
  mdb = MotifDb #()
  mdb.chicken = subset(mdb, organism=='Gallus')
  checkEquals(length(mdb.chicken), 3)
    # text is cat-ed to stdout, so not avaialable here to check.
    # but just like print, export also returns the text invisibly.
    # so that CAN be checked.

  meme.text = export(mdb.chicken, format='meme')
  checkEquals(length(meme.text), 1)   # just one long string
  checkTrue(is.character(meme.text))
  checkTrue(nchar(meme.text) > 800)   # 1002 as of(10 aug 2012)
  return(TRUE)

} # test.exportMemeFormatToStdOut
#------------------------------------------------------------------------------------------------------------------------
test.export_memeFormatToFile = function()
{
  print('--- test.export_memeFormatToFile')
  mdb = MotifDb #()
  mdb.chicken = subset(mdb, organism=='Gallus')
  checkEquals(length(mdb.chicken), 3)
  output.file = tempfile()
  meme.text = export(mdb.chicken, output.file, 'meme')
  retrieved = scan(output.file, what=character(0), sep='\n', quiet=TRUE)
  invisible(retrieved)

} # test.exportMemeFormatToFile
#------------------------------------------------------------------------------------------------------------------------
test.export_memeFormatToFileDuplication = function()
{
  print('--- test.export_memeFormatToFileDuplication')
  mdb = MotifDb #()
  mdb.mouse = subset(mdb, organism=='Mmusculus')
  checkTrue(length(mdb.mouse) > 1300)
  output.file = 'mouse.txt' # tempfile()
  max = 3
  meme.text = export(mdb.mouse [1:max], output.file, 'meme')
  retrieved = scan(output.file, what=character(0), sep='\n', quiet=TRUE)
  invisible(retrieved)

} # test.exportMemeFormatToFileDuplication
#------------------------------------------------------------------------------------------------------------------------
test.export_memeFormatToFile_run_tomtom = function(max=50)
{
  if(interactive()) {
    print('--- test.export_memeFormatToFile_run_tomtom')
    mdb = MotifDb
    sox4.mouse = subset(mdb, organism=='Mmusculus' & geneSymbol=='Sox4')
    all.human = subset(mdb, organism=='Hsapiens')

    tomtom.tmp.dir = tempfile()
    print(tomtom.tmp.dir)
    stopifnot(dir.create(tomtom.tmp.dir))
    sox4.file.path = file.path(tomtom.tmp.dir, 'sox4.mouse.text')
    all.human.file.path = file.path(tomtom.tmp.dir,'all.human.text')
    export(sox4.mouse, sox4.file.path, 'meme')
    export(all.human, all.human.file.path, 'meme')

       # find similarity of motif #1 to all the motifs in mdbMany

       # cannot rely upon tomtom being present

    #cmd = sprintf('tomtom -no-ssc -oc %s -verbosity 3 -min-overlap 5 -mi 1 -dist pearson -evalue -thresh 10 %s %s',
    #                tomtom.tmp.dir, sox4.file.path, all.human.file.path)
    #system(cmd)
    #cmd = sprintf('open %s/tomtom.html', tomtom.tmp.dir)
    #system(cmd)
    } # if interactive

} # test.export_memeFormatToFile_run_tomtom
#------------------------------------------------------------------------------------------------------------------------
# MotIV::motifMatch fails with MotIV_1.25.0.  will look into this in September, 2015, pshannon
# MotIV abandoned, all of my own motif/sequence matching is done via an independently installed
# FIMO from the meme suite.
# another option is the MOODs motif matching capability provided by the bioc package "motifmatchr"
# this possibility, and these tests, are deferred for now.
disabled_test.run_MotIV.motifMatch = function()
{
  require(MotIV)
  print('--- test.run_MotIV.motifMatch')
  mdb <- MotifDb #()

  db.tmp <- mdb@listData

     # match motif 1 against the entire MotifDb  collection
  motif.hits <- motifMatch(db.tmp [1], database<-db.tmp)
     # the long way to extract the matrix name.  see MotIV.toTable below for more convenient way
  checkEquals(motif.hits@bestMatch[[1]]@aligns[[1]]@TF@name, names(db.tmp)[1])

     # match the last motif against all
  last <- length(db.tmp)
     # MotIV:motifMatch works differently on linux and macos.  by asking for 50 matches,
     # the search target(db.tmp[last]) is sure to be in the hit list.
  motif.hits <-  motifMatch(db.tmp [last], database=db.tmp, top=50)
  tbl.hits <- MotIV.toTable(motif.hits)
    # the 5 hits return should include the one we tried to match, but the MotIV search strategy
    # may not place it first
  checkTrue(names(db.tmp[last]) %in% tbl.hits$name)
  invisible(tbl.hits)

} # distable_test.run_MotIV.motifMatch
#------------------------------------------------------------------------------------------------------------------------
MotIV.toTable = function(match)
{
  stopifnot(length(match@bestMatch) >= 1)
  alignments = match@bestMatch[[1]]@aligns

  df = data.frame(stringsAsFactors=FALSE)
  for(alignment in alignments) {
    x = alignment
    name = x@TF@name
    eVal = x@evalue
    sequence = x@sequence
    match = x@match
    strand = x@strand
    df = rbind(df, data.frame(name=name, eVal=eVal, sequence=sequence, match=match, strand=strand, stringsAsFactors=FALSE))
    } # for alignment

  return(df)

} # MotIV.toTable
#------------------------------------------------------------------------------------------------------------------------
disabled_test.MotIV.toTable = function()
{
  print('--- test.MotIVtoTable')
  mdb = MotifDb #()
  test.hits = motifMatch(mdb[1]@listData, database=jaspar)
  tbl.hits =  MotIV.toTable(test.hits)
  checkEquals(dim(tbl.hits), c(5, 5))
  checkEquals(sort(colnames(tbl.hits)), sort(c("name", "eVal", "sequence", "match", "strand")))

} # test.MotIV.toTable
#------------------------------------------------------------------------------------------------------------------------
pwmMatch.toTable = function(motifMatch) {
   if(length(motifMatch@bestMatch) == 0)
   return(NA)

   df.list = vector("list", length(motifMatch@bestMatch))
   for(k in seq(length(motifMatch@bestMatch)))
   {
       alignments = motifMatch@bestMatch[[k]]@aligns
       df = data.frame(stringsAsFactors=FALSE)
       for(alignment in alignments) {
           x = alignment
           name = x@TF@name
           eVal = x@evalue
           sequence = x@sequence
           match = x@match
           strand = x@strand
           df = rbind(df, data.frame(name=name, eVal=eVal, sequence=sequence, match=match, strand=strand, stringsAsFactors=FALSE))
       } # for alignment
       df.list[[k]]=df
   }
   names(df.list) <- names(motifMatch)
   return(df.list)

} # pwmMatch.toTable
#------------------------------------------------------------------------------
# Robert Stojnic reports incorrect gene symbols for matrices obtained from
# flyFactorSurvey.
# the solution was to abandon the original strategy of extracting the
# symbol from the matrix(and file) name.
# now, the flybase importer("inst/scripts/import/flyFactorSurvey/import.R")
# uses FBgn id(which can be reliably extracted) and uses indpendent
# data sources to learn the gene symbol.
#
# robert's email:
#  I'm working on using MotifDb motifs in my PWMEnrich package and I
#  have noticed that there is a slight problem with gene symbols for
#  Drosophila. In particular, the gene symbols do not always correspond
#  to the gene ID and are frequently mis-capitalized. In Drosophila z
#  and Z are two different genes and capitalization does matter if
#  someone is to use the gene symbols. Also, in some cases the symbols
#  are missing hyphens or parenthesis. I have used the gene IDs and the
#  Flybase annotation database to set the correct gene symbols for
#  Drosophila, please find attached the result of my re-annotation.
#
#  looking at his correctedMotifDbDmel.csv
#
#    head(read.table("correctedMotifDbDmel.csv", sep=",", header=TRUE, stringsAsFactors=FALSE))
#                  providerName oldGeneSymbol newGeneSymbol
#    1 ab_SANGER_10_FBgn0259750            Ab            ab
#    2  ab_SOLEXA_5_FBgn0259750            Ab            ab
#    3 Abd-A_FlyReg_FBgn0000014         Abd-a         abd-A
#    4 Abd-B_FlyReg_FBgn0000015         Abd-b         Abd-B
#    5    AbdA_Cell_FBgn0000014          Abda         abd-A
#    6  AbdA_SOLEXA_FBgn0000014          Abda         abd-A
#
test.flyFactorGeneSymbols <- function()
{
    print("--- test.flyFactorGeneSymbols")
    mdb = MotifDb
    checkEquals(sort(mcols(query(mdb, "FBgn0259750"))$geneSymbol),
                sort(c("FBgn0259750", "FBgn0259750")))
    checkEquals(mcols(query(mdb, "FBgn0000014"))$geneSymbol, rep("abd-A", 3))
    checkEquals(mcols(query(mdb, "FBgn0000015"))$geneSymbol, rep("Abd-B", 3))

} # test.flyFactorGeneSymbols
#-------------------------------------------------------------------------------
test.export_jasparFormatStdOut = function()
{
  print('--- test.export_jasparFormatStdOut')
  mdb = MotifDb #()
  mdb.chicken = subset(mdb, organism=='Gallus')
  checkEquals(length(mdb.chicken), 3)
    # text is cat-ed to stdout, so not avaialable here to check.
    # but just like print, export also returns the text invisibly.
    # so that CAN be checked.

  jaspar.text = export(mdb.chicken, format='jaspar')
  checkEquals(length(jaspar.text), 1)   # just one long string
  checkTrue(is.character(jaspar.text))
  checkTrue(nchar(jaspar.text) > 800)   # 1002 as of(10 aug 2012)
  return(TRUE)

} # test.exportjasparFormatToStdOut
#------------------------------------------------------------------------------------------------------------------------
test.export_jasparFormatToFile = function()
{
  print('--- test.export_jasparFormatToFile')
  mdb = MotifDb #()
  mdb.chicken = subset(mdb, organism=='Gallus')
  checkEquals(length(mdb.chicken), 3)
  output.file = tempfile()
  jaspar.text = export(mdb.chicken, output.file, 'jaspar')
  retrieved = scan(output.file, what=character(0), sep='\n', quiet=TRUE)
  invisible(retrieved)

} # test.exportjasparFormatToFile
#------------------------------------------------------------------------------------------------------------------------
test.geneToMotif <- function()
{
   printf("--- test.geneToMotif")
   mdb <- MotifDb

   genes <- c("FOS", "ATF5", "bogus", "SATB2")
   good.genes <- genes[-which(genes=="bogus")]
      # use  TFClass family classifcation
   tbl.tfClass <- geneToMotif(mdb, genes, source="TfClaSS")   # intentional mis-capitalization
   checkTrue(all(good.genes %in% tbl.tfClass$gene))

   expected.motifs <- c("MA0833.1", "MA0099.2", "MA0476.1", "MA0679.1", "MA0754.1", "MA0755.1", "MA0756.1", "MA0757.1")
   checkTrue(all(expected.motifs %in% tbl.tfClass$motif))
   checkEquals(unique(tbl.tfClass$source), "TFClass")

      # MotifDb mode uses the MotifDb metadata, pulled from many sources
   tbl.mdb <- geneToMotif(mdb, genes, source="mOtifdb")     # intentional mis-capitalization
   checkEquals(dim(tbl.mdb), c(14, 6))
   checkEquals(subset(tbl.mdb, dataSource=="jaspar2016" & geneSymbol== "FOS")$motif, "MA0476.1")
      # no recognizable(i.e., jaspar standard) motif name returned by MotifDb metadata
      # MotifDb for ATF5
      # todo: compare the MA0110596_1.02 matrix of cisp_1.02 to japar MA0833.1

     # check use of ignore.case
   tbl.caseSensitive <-  geneToMotif(MotifDb, "STAT4", source="MotifDb")
   checkEquals(length(grep("jaspar", tbl.caseSensitive$dataSource, ignore.case=TRUE)), 0)
   tbl.caseInsensitive <-  geneToMotif(MotifDb, "STAT4", source="MotifDb", ignore.case=TRUE)
   checkTrue(length(grep("jaspar", tbl.caseInsensitive$dataSource, ignore.case=TRUE)) >= 3)

   tbl.caseSensitive <-  geneToMotif(MotifDb, "stat4", source="TFclass")
   checkEquals(nrow(tbl.caseSensitive), 0)
   tbl.caseInsensitive <-  geneToMotif(MotifDb, "stat4", source="TFclass", ignore.case=TRUE)
   checkTrue(nrow(tbl.caseInsensitive) >= 5)

} # test.geneToMotif
#------------------------------------------------------------------------------------------------------------------------
# this case discovered(31 jan 2018). when called on a gene/source combination for which there are
# no motifs, i attempted to add the mapping source(either "MotifDb", "TFClass") as a column
# to an empty data.frame.  check for that and its fix here
test.geneToMotif.oneGene.noMotifs <- function()
{
   checkEquals(nrow(geneToMotif(MotifDb, "SATB2", "MotifDb")), 0)
   checkEquals(nrow(geneToMotif(MotifDb, "bogus-arandum", "MotifDb")), 0)
   checkEquals(nrow(geneToMotif(MotifDb, "bogus-arandum", "TFclass")), 0)

} # test.geneToMotif.oneGene.noMotifs
#------------------------------------------------------------------------------------------------------------------------
# sad to say I do not recall what problem/fix is tested here(pshannon, 23 jan 2018).
# however, it demonstrates the variety of results which can be returned by non-jaspar datasets
# when using the MotifDb mapping source, and the relative paucity which is sometimes
# seen with the TFclass mapper
test.geneToMotif.ignore.jasparSuffixes <- function()
{
   printf("--- test.geneToMotif.ignore.jasparSuffixes")
   mdb <- MotifDb

   genes <- c("FOS", "ATF5", "bogus")

      # use  TFClass family classifcation
   tbl.tfClass <- geneToMotif(mdb, genes, source="TfClaSS")   # intentional mis-capitalization
   checkEquals(sort(tbl.tfClass$gene),  sort(c("ATF5", "FOS", "FOS")))
   checkEquals(sort(tbl.tfClass$motif),  sort(c("MA0833.1", "MA0099.2", "MA0476.1")))
   checkEquals(tbl.tfClass$source, rep("TFClass", 3))

      # MotifDb mode uses the MotifDb metadata, pulled from many sources
   tbl.mdb <- geneToMotif(mdb, genes, source="mOtifdb")     # intentional mis-capitalization
   checkEquals(dim(tbl.mdb), c(14, 6))
   checkEquals(subset(tbl.mdb, dataSource=="jaspar2016" & geneSymbol== "FOS")$motif, "MA0476.1")
      # no recognizable(i.e., jaspar standard) motif name returned by MotifDb metadata
      # MotifDb for ATF5

      # compare the MA0110599_1.02 matrix of cisp_1.02 to japar MA0476.1: the identical matrix!
      # 1         FOS    MA0110599_1.02   cisbp_1.02  Hsapiens 24194598 MotifDb
      # 10        FOS          MA0476.1   jaspar2018  Hsapiens 17916232 MotifDb
      # this establishes the need for careful scrutiny as one winnows a geneToMotif result into
      # useful non-reduplicative sequence analysis

   pfm.ma0110599 <- as.list(query(mdb, "MA0110599"))[[1]]
   pfm.ma0476.1  <- as.list(query(query(mdb, "MA0476.1"), "jaspar2018"))[[1]]
   checkEquals(pfm.ma0110599, pfm.ma0476.1)

} # test.geneToMotif.ignore.jasparSuffixes
#------------------------------------------------------------------------------------------------------------------------
test.motifToGene <- function()
{
   printf("--- test.motifToGene")
   #printf("Sys.getlocale: %s", Sys.getlocale())

     # good test case of querying both sources,
   motif <- "MA0099.2"
   tbl <- motifToGene(MotifDb, motif, c("MotifDb", "TFclass"))
      # [1]   mdb.genes: AP1, JUN::FOS, FOS::JUN, FOS::JUN
      # [1]   tfc.genes: FOS, JUN
   checkEquals(sort(unique(tbl$geneSymbol)), c("AP1", "FOS", "FOS::JUN", "JUN", "JUN::FOS"))

   motifs <- c("MA0592.2", "UP00022", "ELF1.SwissRegulon")

      # MotifDb mode uses the MotifDb metadata "providerId",
   tbl.mdb <- motifToGene(MotifDb, motifs, source="MotifDb")
   checkEquals(dim(tbl.mdb), c(3, 5))
   expected <- sort(c("MA0592.2", "ELF1.SwissRegulon", "UP00022"))
   actual <- sort(tbl.mdb$motif)
   checkEquals(actual, expected)
   checkEquals(sort(tbl.mdb$geneSymbol), sort(c("ELF1", "Esrra", "Zfp740")))
   checkEquals(tbl.mdb$source,     rep("MotifDb", 3))

      # TFClass mode uses  TF family classifcation
   tbl.tfClass <- motifToGene(MotifDb, motifs, source="TFClass")
   checkEquals(dim(tbl.tfClass), c(9,5))
   checkEquals(tbl.tfClass$motif, rep("MA0592.2", 9))
   checkEquals(sort(tbl.tfClass$gene), sort(c("AR", "ESR1", "ESR2", "ESRRA", "ESRRB", "ESRRG", "NR3C1", "NR3C2", "PGR")))
   checkEquals(tbl.tfClass$source,       rep("TFClass", 9))

     # test motifs with regex characters in them, or other characters neither letter nor number
   motifs <- sort(c("DMAP1_NCOR{1,2}_SMARC.p2", "ELK1,4_GABP{A,B1}.p3", "SNAI1..3.p2", "EWSR1-FLI1.p2", "ETS1,2.p2"))
   tbl <- motifToGene(MotifDb, motifs, source="MotifDb")
   checkEquals(nrow(tbl), 0)

   tbl <- motifToGene(MotifDb, motifs, source="tfclass")
   checkEquals(ncol(tbl), 5)
   checkTrue(nrow(tbl) > 80)
   checkTrue(nrow(tbl) < 100)
   checkTrue(all(motifs %in% tbl$motif))

   motifs <- c("Hsapiens-HOCOMOCOv10-IKZF1_HUMAN.H10MO.C",
               "MA0099.2",
               "Hsapiens-SwissRegulon-ARID3A.SwissRegulon",
               "MA0592.2",
               "MA0036913_1.02")

   tbl <- motifToGene(MotifDb, motifs, source=c("MotifDb", "TFClass"))
   checkTrue(all(motifs %in% tbl$motif))

   motif <- "Mmusculus;Rnorvegicus;Hsapiens-jaspar2018-FOS::JUN-MA0099.2"
   tbl <- motifToGene(MotifDb, motif, source="TFClass")
   checkEquals(tbl$geneSymbol, c("FOS", "JUN"))

   motifs <- c("Hsapiens-jaspar2016-RUNX1-MA0002.1",
               "Hsapiens-jaspar2016-TFAP2A-MA0003.1",
               "Hsapiens-jaspar2016-TFAP2A-MA0003.2",
               "Hsapiens-jaspar2016-TFAP2A-MA0003.3",
               "Hsapiens-jaspar2016-AR-MA0007.2",
               "MA0872.1")
   tbl <- motifToGene(MotifDb, motifs, source="Motifdb")
   checkTrue(all(motifs %in% tbl$motif))
   checkEquals(sort(unique(tbl$geneSymbol)), c("AR", "RUNX1", "TFAP2A", "TFAP2A(var.3)"))

   tbl <- motifToGene(MotifDb, motifs, source="TFClass")
   checkEquals(sort(unique(tbl$motif)), c("Hsapiens-jaspar2016-TFAP2A-MA0003.3", "MA0872.1"))
   checkEquals(sort(unique(tbl$geneSymbol)), c("TFAP2A", "TFAP2B", "TFAP2C", "TFAP2D", "TFAP2E"))

   tbl <- motifToGene(MotifDb, motifs, source=c("MotifDb", "TFClass"))
   checkTrue(all(motifs %in% tbl$motif))
   checkEquals(sort(unique(tbl$geneSymbol)),
                    c("AR", "RUNX1", "TFAP2A", "TFAP2A(var.3)", "TFAP2B", "TFAP2C", "TFAP2D", "TFAP2E"))

      #(23 may 2018) found that MotifDb works, but c("MotifDb", "TFClass") does not
      # test the fix here

   motifs <- c("Hsapiens-jolma2013-IRF5-2", "Hsapiens-SwissRegulon-IRF5.SwissRegulon")
   checkEquals(motifToGene(MotifDb, motifs, source=c("MotifDb"))$geneSymbol, c("IRF5", "IRF5"))
   checkEquals(nrow(motifToGene(MotifDb, motifs, source=c("TFClass"))), 0)
   checkEquals(motifToGene(MotifDb, motifs, source=c("MotifDb", "TFClass"))$geneSymbol, c("IRF5", "IRF5"))

   } # test.motifToGene
#------------------------------------------------------------------------------------------------------------------------
test.associateTranscriptionFactors <- function()
{
   printf("--- test.associateTranscriptionFactors")

   mdb <- MotifDb
   pfms <- query(mdb, andStrings=c("sapiens", "jaspar2016"))

      # query(mdb, "jaspar2016", "sapiens")
      # first check motifs with MotifDb-style long names, using MotifDb lookup, in the
      # metadata of MotifDb:
      #      "Hsapiens-jaspar2016-RUNX1-MA0002.1"  "Hsapiens-jaspar2016-TFAP2A-MA0003.1"

   motif.names <- c(names(pfms[1:5]), "MA0872.1", "hocus.pocus")
   tbl <- data.frame(motifName=motif.names, score=runif(7), stringsAsFactors=FALSE)

   tbl.anno.mdb <- associateTranscriptionFactors(mdb, tbl, source="MotifDb", expand.rows=TRUE)
   checkEquals(nrow(tbl), nrow(tbl.anno.mdb))
   checkTrue(is.na(tbl.anno.mdb$geneSymbol[grep("hocus.pocus", tbl.anno.mdb$motifName)]))
   checkTrue(all(c("AR", "RUNX1", "TFAP2A", "TFAP2A", "TFAP2A", "TFAP2A(var.3)") %in% tbl.anno.mdb$geneSymbol))

   tbl.anno.tfc <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=TRUE)
   checkTrue(nrow(tbl) < nrow(tbl.anno.tfc))
   checkEquals(sort(unique(tbl.anno.tfc$geneSymbol)), c("TFAP2A", "TFAP2B", "TFAP2C", "TFAP2D", "TFAP2E"))

   tbl.anno.both <- associateTranscriptionFactors(mdb, tbl, source=c("MotifDb", "TFClass"), expand.rows=TRUE)
   checkEquals(length(grep("MotifDb", tbl.anno.both$source)), 6)
   checkEquals(length(grep("TFClass", tbl.anno.both$source)), 10)

   #   checkEquals(dim(tbl.anno.mdb), c(nrow(tbl), ncol(tbl) + 4))
   #   checkTrue(all(c("geneSymbol", "pubmedID") %in% colnames(tbl.anno.mdb)))
   #   checkEquals(sort(tbl.anno.mdb$geneSymbol),
   #               sort(c("AR", "RUNX1", "TFAP2A", "TFAP2A", "TFAP2A", "TFAP2A(var.3)")))
   #
   #      # now add in a bogus motif name, one for which there cannot possibly be a TF
   #
   #   motif.names[3] <- "bogus"
   #   tbl <- data.frame(motifName=motif.names, score=runif(5), stringsAsFactors=FALSE)
   #   tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="MotifDb", expand.rows=FALSE)
   #   checkTrue(is.na(tbl.anno$geneSymbol[3]))
   #   checkTrue(is.na(tbl.anno$pubmedID[3]))
   #
   #      # now check motifs with short, traditional names, in "TFClass" mode, which uses
   #      # the tfFamily
   #      #      "MA0002.1" "MA0003.1" "MA0003.2" "MA0003.3" "MA0007.2"
   #
   #   motif.names <- names(pfms[1:5])
   #   short.motif.names <- unlist(lapply(strsplit(motif.names, "-"), function(tokens) return(tokens[length(tokens)])))
   #   tbl <- data.frame(motifName=motif.names, shortMotif=short.motif.names, score=runif(5), stringsAsFactors=FALSE)
   #
   #   tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=FALSE)
   #   checkEquals(dim(tbl.anno), c(nrow(tbl), ncol(tbl) + 2))
   #
   #      # TFClass only annotates MA0003.3, none of the others
   #   checkTrue(all(is.na(tbl.anno$geneSymbol[-4])))
   #   checkTrue(all(is.na(tbl.anno$pubmedID[-4])))
   #   checkEquals(tbl.anno$geneSymbol[4], "TFAP2A;TFAP2B;TFAP2C;TFAP2D;TFAP2E")
   #   checkEquals(tbl.anno$pubmedID[4],   "23180794")
   #
   #      # now ask for expandsion of the semicolon separated list
   #   tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=TRUE)
   #   checkEquals(dim(tbl.anno), c(nrow(tbl) + 4, ncol(tbl) + 2))
   #   checkTrue(all(c("TFAP2A", "TFAP2B", "TFAP2C", "TFAP2D", "TFAP2E") %in% tbl.anno$geneSymbol))
   #
   #      # now add in a bogus motif name, one for which there cannot possibly be a TF
   #
   #   motif.names <- names(pfms[1:5])
   #   short.motif.names <- unlist(lapply(strsplit(motif.names, "-"), function(tokens) return(tokens[length(tokens)])))
   #   short.motif.names[4] <- "bogus"
   #   tbl <- data.frame(shortMotif=short.motif.names, score=runif(5), stringsAsFactors=FALSE)
   #
   #   tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=FALSE)
   #   checkEquals(dim(tbl.anno), c(nrow(tbl), ncol(tbl) + 2))
   #      # after adding bogus to the only mapped motif name, all geneSymbol and pubmedID values should be NA
   #   checkTrue(all(is.na(tbl.anno$geneSymbol)))
   #   checkTrue(all(is.na(tbl.anno$pubmedID)))
   #
   #      # now make sure that the absence of the  TFClass-specific "shortMotif" field is detected
   #   motif.names <- names(pfms[1:5])
   #   tbl <- data.frame(motifName=motif.names, score=runif(5), stringsAsFactors=FALSE)
   #   checkException(tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=FALSE), silent=TRUE)

      # now some motif names
} # test.associateTranscriptionFactors
#------------------------------------------------------------------------------------------------------------------------
# disabled (2 apr 2020) due to very large (~100?) dependendencies introducted directly and indirectly
# via motifmatchr, TFBSTools, universalmotif
# test.match <- function()
# {
#    printf("--- test.match")
#    gr.region <- GRanges(seqnames="chr1", IRanges(start=47229520, end=47229560))
#    motifs <- query(MotifDb, c("jaspar2018", "ZNF263"))
#    checkEquals(length(motifs), 1)
#    gr.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-5)
#    checkEquals(length(gr.match), 1)  # just one motif
#    checkEquals(names(gr.match), names(motifs))
#    checkEquals(length(gr.match[[1]]), 3)
#
#    tbl.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-5, fimoDataFrameStyle=TRUE)
#    checkEquals(dim(tbl.match), c(3, 7))
#    checkTrue(all(tbl.match$motif == names(motifs)))
#    checkEquals(class(tbl.match$chrom), "character")  # not a factor
#
#    motifs <- query(MotifDb, "ZNF263", c("jaspar2018", "swissregulon"))
#    checkEquals(length(motifs), 2)
#    gr.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-5)
#    checkEquals(names(gr.match), names(motifs))
#    checkEquals(as.numeric(lapply(gr.match, length)), c(3, 1))
#
#    tbl.match <-matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-5, fimoDataFrameStyle=TRUE)
#    checkEquals(dim(tbl.match), c(4, 7))
#    checkEquals(length(unique(tbl.match$motif)), 2)
#    checkEquals(unique(tbl.match$motif), names(motifs))
#    checkEquals(colnames(tbl.match), c("chrom", "start", "end", "width", "strand", "mood.score", "motif_id"))
#
#
#         #------------------------------------------------
#         # now all jaspar2018 human motifs
#         #------------------------------------------------
#
#    motifs <- query(MotifDb, c("jaspar2018", "hsapiens"))
#    tbl.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-5, fimoDataFrameStyle=TRUE)
#    checkEquals(dim(tbl.match), c(7, 7))
#    checkEquals(sort(unique(tbl.match$motif)),
#                c("Hsapiens-jaspar2018-EWSR1-FLI1-MA0149.1", "Hsapiens-jaspar2018-ZNF263-MA0528.1"))
#
#         #-----------------------------------------------------
#         # now all jaspar2018 human motifs, loosen the pValue
#         #-----------------------------------------------------
#
#    motifs <- query(MotifDb, c("jaspar2018", "hsapiens"))
#    tbl.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-4, fimoDataFrameStyle=TRUE)
#    checkTrue(nrow(tbl.match) > 15)
#
#    tbl.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-3, fimoDataFrameStyle=TRUE)
#    checkTrue(nrow(tbl.match) > 50)
#
#        #-------------------------------------------------------------
#        # now all jaspar2018 and hocomoco human motifs across 10kb
#        #------------------------------------------------------------
#
#    motifs <- query(MotifDb, "hsapiens", orStrings=c("jaspar2018", "hocomoco-core"))
#    checkTrue(length(motifs) > 500)
#    gr.region <- GRanges(seqnames="chr1", IRanges(start=47229000, end=47239000))
#
#    tbl.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-7, fimoDataFrameStyle=TRUE)
#    checkTrue(nrow(tbl.match) > 90 && nrow(tbl.match) < 110)
#    checkEquals(order(tbl.match$start), seq_len(nrow(tbl.match)))
#
# } # test.match
#------------------------------------------------------------------------------------------------------------------------
findMotifsWithMutuallyExclusiveMappings <- function()
{
   xtab <- as.data.frame(table(MotifDb@manuallyCuratedGeneMotifAssociationTable$motif))
   xtab <- xtab[order(xtab$Freq, decreasing=TRUE),]
   for(motif in xtab$Var1){
      mdb.genes <- toupper(motifToGene(MotifDb, motif, "motifdb")$geneSymbol)
      tfc.genes <- toupper(motifToGene(MotifDb, motif, "tfclass")$geneSymbol)
      if(length(mdb.genes) == 0) next
      if(length(tfc.genes) == 0) next
      if(length(intersect(mdb.genes, tfc.genes)) == 0){
         printf("------ %s", motif)
         printf("  mdb.genes: %s", paste(mdb.genes, collapse=", "))
         printf("  tfc.genes: %s", paste(tfc.genes, collapse=", "))
        } # if no intersection
      } # for motif

   #  [1] ------ MA0099.2
   #  [1]   mdb.genes: AP1, JUN::FOS, FOS::JUN, FOS::JUN
   #  [1]   tfc.genes: FOS, JUN

} # findMotifsWithMutuallyExclusiveMappings
#------------------------------------------------------------------------------------------------------------------------
test.hocomoco11.with.reliabilityScores <- function()
{
   printf("--- test.hocomoco11.with.reliabilityScores")

     #-------------------------------------------------------------------------
     # these queries rely primarily upon the dataSoure column of the metadata
     # subsequent checks below look at metadata rownames and matrix names
     #-------------------------------------------------------------------------

   checkEquals(length(query(MotifDb, "hocomoco")), 1834)
   checkEquals(length(query(MotifDb, "hocomocov10")), 1066)
   checkEquals(length(query(MotifDb, "hocomocov11")), 768)
   checkEquals(length(query(MotifDb, "hocomocov11-core")), 400)
   checkEquals(length(query(MotifDb, "hocomocov11-secondary")), 368)

   checkEquals(length(query(MotifDb, "hocomocov11-core-A")), 181)
   checkEquals(length(query(MotifDb, "hocomocov11-secondary-A")), 46)

   checkEquals(length(query(MotifDb, "hocomocov11-core-B")), 84)
   checkEquals(length(query(MotifDb, "hocomocov11-secondary-B")), 19)

   checkEquals(length(query(MotifDb, "hocomocov11-core-C")), 135)
   checkEquals(length(query(MotifDb, "hocomocov11-secondary-C")), 13)

   checkEquals(length(query(MotifDb, "hocomocov11-core-D")), 0)
   checkEquals(length(query(MotifDb, "hocomocov11-secondary-D")), 290)

     #-------------------------------------------------------------------------
     # check matrix names
     #-------------------------------------------------------------------------
   checkEquals(length(grep("HOCOMOCOv11-core-A", names(MotifDb))), 181)
   checkEquals(length(grep("HOCOMOCOv11-secondary-A", names(MotifDb))), 46)

   checkEquals(length(grep("HOCOMOCOv11-core-A", rownames(mcols(MotifDb)))), 181)
   checkEquals(length(grep("HOCOMOCOv11-secondary-A", rownames(mcols(MotifDb)))), 46)

} # test.hocomoco11.with.reliabilityScores
#------------------------------------------------------------------------------------------------------------------------
if(!interactive())
   runTests()

Try the MotifDb package in your browser

Any scripts or data that you put into this service are public.

MotifDb documentation built on Nov. 8, 2020, 6:28 p.m.