MotifDb: An Annotated Collection of Protein-DNA Binding Sequence Motifs

# stamlab/test.R
# notes:  3 matrices come w/o speciesID, tax = 'vertebrates'.  not our problem to fix, at least not yet.
#         TBP, HNF4A and CEBPA (MA0108.2, MA0114.1, MA0102.2)
#------------------------------------------------------------------------------------------------------------------------
library (RUnit)
#------------------------------------------------------------------------------------------------------------------------
source("import.R")
#------------------------------------------------------------------------------------------------------------------------
run.tests = function (dataDir)
{
  dataDir <- file.path(dataDir, "stamlab")
  x.rawMatrixList <<- test.readRawMatrices (dataDir)
  x.novels <<- test.readNovelStatus (dataDir)
  x.matrices <<- test.extractAndNormalizeMatrices (x.rawMatrixList)
  x.tbl.md <<- test.createMetadataTable (x.matrices, x.novels)
  x.matrices.renamed <<- test.renameMatrices (x.matrices, x.tbl.md)

} # run.tests
#------------------------------------------------------------------------------------------------------------------------
test.readRawMatrices = function (dataDir)
{
  print ('--- test.readMatrices')
  list.pwms = readRawMatrices (dataDir)
  checkEquals (length (list.pwms), 683)
  checkEquals (names (list.pwms [[1]]), c ("title", "consensus.sequence", "matrix"))
  checkEquals (rownames (list.pwms[[1]]$matrix),  c ("A", "C", "G", "T"))
  invisible (list.pwms)

} # test.readRawMatrices
#------------------------------------------------------------------------------------------------------------------------
test.readNovelStatus = function (dataDir)
{
  print ('--- test.readNovelStatus')
  novel.status = readNovelStatus (dataDir)
  checkEquals (length (novel.status), 683)
  checkEquals (length (which (novel.status == TRUE)), 289)
    # do a spot check around first novel in novels.txt
  checkEquals (as.logical (novel.status [c ('UW.Motif.0010', 'UW.Motif.0011', 'UW.Motif.0012')]),
                             c (FALSE, FALSE, TRUE))
  invisible (novel.status)

} # readNovelStatus
#------------------------------------------------------------------------------------------------------------------------
test.extractAndNormalizeMatrices = function (x.rawMatrixList)
{
  print ('--- test.extractAndNormalizeMatrices')
  matrices.fixed <<- extractAndNormalizeMatrices (x.rawMatrixList)
    # make sure a UW.Motif.0nnn name accompanies each matrix
  checkEquals (length (grep ('UW.Motif.0', names (matrices.fixed))), length (matrices.fixed))
    # make sure all columns in all matrices sum to 1.0
  checkTrue (all (sapply (matrices.fixed, function (m) all (abs (colSums (m) - 1.0) < 1e-10))))
  invisible (matrices.fixed)

} # test.extractAndNormalizeMatrices
#------------------------------------------------------------------------------------------------------------------------
test.convertRawMatricesToStandard = function (tbl.rmat)
{
  print ('--- test.convertRawMatricesToStandard')
   # get just the first two raw matrices

  first.two.ids = head (unique (tbl.rmat$id), n=2)
  rows = nrow (subset (tbl.rmat, id %in% first.two.ids))
  matrices = convertRawMatricesToStandard (tbl.rmat [1:rows,])
  checkEquals (length (matrices), 2)
  checkEquals (names (matrices), first.two.ids)

    # it will not always be true, but IS true for the first two matrices, currently "9229" and  "9231", that there
    # are an equal number of nucleotides at each position.
  checkTrue (all (colSums (matrices [[1]]) == 97))
  checkTrue (all (colSums (matrices [[2]]) == 185))

    # now run all the matrices through
  matrices = convertRawMatricesToStandard (tbl.rmat)
  checkEquals (length (matrices), 459)
  checkEquals (names (matrices)[1:2], first.two.ids)
  
  invisible (matrices)
  
} # test.convertRawMatricesToStandard 
#------------------------------------------------------------------------------------------------------------------------
test.createAnnotationTable = function ()
{
  print ('--- test.createAnnotationTable')
  tbl.anno = createAnnotationTable ()
  checkEquals (dim (tbl.anno), c (513, 13))
  expected = c ("fullID", "id", "category", "mID", "version", "binder", "speciesID", "proteinID", "family", "tax", "class", "pubmed", "type")
  checkEquals (colnames (tbl.anno), expected)

  checkEquals (head (tbl.anno$fullID),  c ("MA0001.1", "MA0003.1", "MA0004.1", "MA0005.1", "MA0006.1", "MA0006.1"))
  invisible (tbl.anno)

} # test.createAnnotationTable
#------------------------------------------------------------------------------------------------------------------------
test.createMetadataTable = function (x.matrices, x.novels)
{
   print ('--- test.createMetadataTable')
    # try it first with just two matrices
   tbl.md = createMetadataTable (x.matrices [1:12], x.novels [1:12])
   checkEquals (dim (tbl.md), c (12, 15))
   checkEquals (colnames (tbl.md), c ("providerName", "providerId", "dataSource", "geneSymbol", "geneId", "geneIdType", 
                                      "proteinId", "proteinIdType", "organism", "sequenceCount", "bindingSequence",
                                      "bindingDomain", "tfFamily", "experimentType", "pubmedID"))
   checkEquals (tbl.md$providerName [1:2], c ('UW.Motif.0001', 'UW.Motif.0002'))
   checkEquals (tbl.md$providerId [1:2], c ('UW.Motif.0001', 'UW.Motif.0002'))
   checkEquals (tbl.md$pubmedID [1:2], c ("22955618", "22955618"))
   checkEquals (tbl.md$dataSource [1:2], c ('stamlab', 'stamlab'))
   checkEquals (tbl.md$organism [1:2], c ('Hsapiens', 'Hsapiens'))
   checkEquals (tbl.md$experimentType [1:2], c ('digital genomic footprinting', 'digital genomic footprinting'))
   checkEquals (tbl.md$geneId, c (rep ('knownMotif', 11), 'novelMotif'))
   checkTrue(all(is.na(tbl.md$geneIdType)))

   invisible (tbl.md)

} # test.createMetadataTable
#------------------------------------------------------------------------------------------------------------------------
test.renameMatrices = function (matrices, tbl.md)
{
  print("--- test.renameMatrices")
  
    # try it with just the first two matrices
  matrix.pair = matrices [1:2]
  tbl.pair = tbl.md [1:2,]
  matrix.pair.renamed = renameMatrices (matrix.pair, tbl.pair)
  checkEquals (names (matrix.pair.renamed), c ("Hsapiens-stamlab-UW.Motif.0001", "Hsapiens-stamlab-UW.Motif.0002"))

} # test.renameMatrices
#------------------------------------------------------------------------------------------------------------------------
test.convertTaxonCode = function ()
{
  print ('--- test.convertTaxonCode')

  checkEquals (convertTaxonCode ('9606'), 'Hsapiens')
  checkEquals (convertTaxonCode (9606), 'Hsapiens')
     # anomalous codes, which an examination of the jaspar website reveals as 'vertebrates'
  checkEquals (convertTaxonCode (NA), 'Vertebrata')
  checkEquals (convertTaxonCode ('NA'), 'Vertebrata')
  checkEquals (convertTaxonCode (NA_character_), 'Vertebrata')
  checkEquals (convertTaxonCode ('-'), 'Vertebrata')

} # test.convertTaxonCode
#------------------------------------------------------------------------------------------------------------------------
test.guessProteinIdentifierType = function (moleculeName)
{
  print ('--- test.guessProteinIdentifierType')
  checkEquals (guessProteinIdentifierType ('P29383'), 'UNIPROT')

  all.types = sapply (x.tbl.anno$proteinID, guessProteinIdentifierType)
  checkTrue (length (which (is.na (all.types))) < 12)   # got most of them.

} # test.guessProteinIdentifierType
#------------------------------------------------------------------------------------------------------------------------
test.normalizeMatrices = function (matrices)
{
  print ('--- test.normalizeMatrices')

  colsums = as.integer (sapply (matrices, function (mtx) as.integer (mean (round (colSums (mtx))))))
  #checkTrue (all (colsums > 1))

  matrices.norm = normalizeMatrices (matrices)

  colsums = as.integer (sapply (matrices.norm, function (mtx) as.integer (mean (round (colSums (mtx))))))
  checkTrue (all (colsums == 1))

  invisible (matrices.norm)

} # test.normalizeMatrices
#------------------------------------------------------------------------------------------------------------------------
test.assignGeneId = function (proteinId)
{
  print ('--- test.assignGeneId')
  uniprot.ids = c ('Q9GRA5', 'P31314', 'AAC18941', 'O49397')
  refseq.ids  = c ('NP_995315.1', 'NP_032840', 'NP_599022')
  yeast.ids   = c ('YKL112W', 'YMR072W', 'YLR131C')

  checkEquals (assignGeneId ('NP_995315.1'), list (geneId='4782', type='ENTREZ'))
  checkEquals (assignGeneId ('NP_599022'),   list (geneId='6095', type='ENTREZ'))

  checkEquals (assignGeneId ('P31314'),      list (geneId='3195', type='ENTREZ'))

  checkEquals (assignGeneId ('YKL112W'),     list (geneId='YKL112W', type='SGD'))

    # see how successful this is over all 513 proteinIds

  tbl.anno = createAnnotationTable ()
  mtx.geneId = as.data.frame (t (sapply (tbl.anno$proteinID, assignGeneId)))
  tbl.types = as.data.frame (table (as.character (mtx.geneId$type), useNA='always'), stringsAsFactors=FALSE)
  checkEquals (tbl.types$Var1, c ("ENTREZ", "SGD", NA))
  checkEquals (tbl.types$Freq, c (141, 177, 195))

} # test.assignGeneId
#------------------------------------------------------------------------------------------------------------------------
test.parsePwm = function ()
{
  print ('--- test.parsePwm')
  lines = c ('UW.Motif.0006	aggaaatg',
             '0.890585	0.007855	0.051323	0.050237',
             '0.060732	0.004506	0.894170	0.040593',
             '0.072765	0.037935	0.860704	0.028596',
             '0.929585	0.024037	0.034914	0.011464',
             '0.931220	0.023231	0.029078	0.016471',
             '0.857934	0.044211	0.072594	0.025261',
             '0.065840	0.013777	0.058013	0.862370',
             '0.049937	0.036238	0.861871	0.051953')
  m6 = parsePwm (lines)
  checkEquals (names (m6), c ("title", "consensus.sequence", "matrix"))
  pwm = m6$matrix
  checkEquals (dim (pwm), c (4, 8))
  checkEquals (rownames (pwm), c ('A', 'C', 'G', 'T'))
  invisible (m6)

} # test.parsePwm
#------------------------------------------------------------------------------------------------------------------------

PriceLab/MotifDb documentation built on March 23, 2022, 7:42 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

PriceLab/MotifDb
An Annotated Collection of Protein-DNA Binding Sequence Motifs

inst/scripts/import/stamlab/test.R
In PriceLab/MotifDb: An Annotated Collection of Protein-DNA Binding Sequence Motifs

R Package Documentation

Browse R Packages

We want your feedback!

PriceLab/MotifDb An Annotated Collection of Protein-DNA Binding Sequence Motifs

inst/scripts/import/stamlab/test.R In PriceLab/MotifDb: An Annotated Collection of Protein-DNA Binding Sequence Motifs

R Package Documentation

Browse R Packages

We want your feedback!

PriceLab/MotifDb
An Annotated Collection of Protein-DNA Binding Sequence Motifs

inst/scripts/import/stamlab/test.R
In PriceLab/MotifDb: An Annotated Collection of Protein-DNA Binding Sequence Motifs