library(trena)
library(MotifDb)
library(RUnit)
Sys.setlocale("LC_ALL", "C")
#----------------------------------------------------------------------------------------------------
printf <- function(...) print(noquote(sprintf(...)))
#sequence <- "ACCAGCATGCAAATTAGACAA"
#----------------------------------------------------------------------------------------------------
runTests <- function()
{
#profmem(test_basicConstructor())
test_getSequence()
test_.matchPwmForwardAndReverse()
test_bugInStartEndOfMinusStrandHits()
test_.parseVariantString()
test_.injectSnp()
test_getSequenceWithVariants()
test_.getScoredMotifs()
test_noMatch()
test_findMatchesByChromosomalRegion()
test_findMatchesByChromosomalRegion_contrastReferenceWithVariant()
test_findMatchesByChromosomalRegion.twoAlternateAlleles()
test_findMatchesByChromosomalRegion.yeast()
test_findMatchesByChromosomalRegion.athaliana()
} # runTests
#------------------------------------------------------------------------------------------------------------------------
test_basicConstructor <- function(reuse=FALSE)
{
if(!reuse) printf("--- test_basicConstructor")
checkException(mm <- MotifMatcher(genomeName="hg38"), silent=TRUE)
matrices <- as.list(MotifDb) # use all matrices
mm <- MotifMatcher(genomeName="hg38", as.list(MotifDb))
pfms <- getPfms(mm) # jaspar pfms used, real-time download
checkTrue(is.list(pfms))
checkTrue(is.matrix(pfms[[1]]))
checkEquals(length(pfms), length(matrices))
jaspar.human.pfms <- as.list(query(query(MotifDb, "jaspar"), "sapiens"))
motifMatcher <- MotifMatcher(genomeName="hg38", pfms=jaspar.human.pfms)
pfms <- getPfms(motifMatcher) # jaspar pfms used, real-time download
checkTrue(is.list(pfms))
checkTrue(is.matrix(pfms[[1]]))
checkEquals(length(pfms), length(jaspar.human.pfms))
} # test_basicConstructor
#----------------------------------------------------------------------------------------------------
test_.matchPwmForwardAndReverse <- function()
{
# TODO: odd windows-only bug
if(!interactive()) return(TRUE);
printf("--- test_.matchPwmForwardAndReverse")
motifName.1 <- "MA0507.1"
mtx.1 <- query(MotifDb, motifName.1)[[1]];
motifName.2 <- "MA0063.1" #"MA0627.1"
mtx.2 <- query(MotifDb, motifName.2)[[1]];
test.sequence <- "TTGTCTAATTTGCATGCTGGT"
tbl.1 <- trena:::.matchPwmForwardAndReverse(test.sequence, mtx.1, motifName.1, min.match.percentage=90, quiet=TRUE)
checkEquals(nrow(tbl.1), 1)
checkEquals(colnames(tbl.1),
c("start","end","width","score","maxScore","relativeScore","motif","match","strand"))
checkEquals(unlist(gregexpr(tbl.1$match[1], test.sequence)), tbl.1$start[1])
checkEquals(tbl.1$end, (tbl.1$start[1] + nchar(tbl.1$match)) - 1)
tbl.2 <- trena:::.matchPwmForwardAndReverse(test.sequence, mtx.2, motifName.2, min.match.percentage=60, quiet=TRUE)
checkEquals(nrow(tbl.2), 4)
checkEquals(colnames(tbl.2),
c("start","end","width","score","maxScore","relativeScore","motif","match","strand"))
# the reported match, for both + AND -, are done with forward strand sequence
# thus the same start and end tests apply to both
for(r in seq_len(nrow(tbl.2))){
checkEquals(unlist(gregexpr(tbl.2$match[r], test.sequence)), tbl.2$start[r])
checkEquals(tbl.2$end[r], (tbl.2$start[r] + nchar(tbl.2$match[r])) - 1)
} # for r
} # test_.matchForwardAndReverse
#----------------------------------------------------------------------------------------------------
test_.getScoredMotifs <- function()
{
printf("--- test_.getScoredMotifs")
seqs.3 <- test_getSequence(indirect=TRUE) # returns 3 dna sequences
jaspar.human.pfms <- as.list(query (query(MotifDb, "sapiens"), "jaspar2016"))
# multiple sequences can be passed in, with a list of tables returned.
motifs.90 <- trena:::.getScoredMotifs(seqs.3, jaspar.human.pfms, min.match.percentage=90)
# >= 90% match results only for the third sequence
checkEquals(unlist(lapply(motifs.90, nrow)), c(0, 0, 2))
motifs.75 <- trena:::.getScoredMotifs(seqs.3, jaspar.human.pfms, min.match.percentage=75)
checkEquals(unlist(lapply(motifs.75, nrow)), c(7, 0, 30))
checkEquals(colnames(motifs.75[[1]]),
c("start", "end", "width", "score", "maxScore", "relativeScore", "motif", "match", "strand"))
checkEquals(colnames(motifs.75[[3]]),
c("start", "end", "width", "score", "maxScore", "relativeScore", "motif", "match", "strand"))
motifs.perfect <- trena:::.getScoredMotifs(seqs.3, jaspar.human.pfms, min.match.percentage=100) # nigh impossible threshold
checkEquals(unlist(lapply(motifs.perfect, nrow)), c(0, 0, 0))
} # test_.getScoredMotifs
#----------------------------------------------------------------------------------------------------
#test_tfGeneSymbolForMotif <- function()
#{
# printf("--- test_tfGeneSymbolForMotif")
#
# jaspar.human.pfms <- query(query(MotifDb, "jaspar"), "sapiens") # 625 human pfms from jaspar
# mm.mdb <- MotifMatcher(genomeName="hg38", pfms=jaspar.human.pfms)
#
# # get the underlying matrix and metadata for testing
# matrix.list <- getPfms(mm.mdb)
# matrix.metadata <- getPfmMetadata(mm.mdb)
#
# set.seed(31)
# matrix.count <- length(matrix.list)
# random.indices <- sample(seq_len(matrix.count), 5)
# motif.names <- names(matrix.list)[random.indices]
#
# tbl.1 <- tfGeneSymbolForMotif(mm.mdb, motif.names[1])
# checkEquals(nrow(tbl.1), 1)
# checkEquals(tbl.1$motif, "MA0473.1")
# checkEquals(tbl.1$geneSymbol, "ELF1")
#
# tbl.5 <- tfGeneSymbolForMotif(mm.mdb, motif.names)
# checkEquals(tbl.5$motif, c("MA0473.1", "MA0865.1", "MA0103.2", "MA0101.1", "MA0855.1"))
# checkEquals(tbl.5$geneSymbol, c("ELF1", "E2F8", "ZEB1", "REL", "RXRB"))
#
# # now use the default pfms (jaspar vertebrate) and metadata (trena/extdata/motifGenes.tsv)
# mm.default <- MotifMatcher(genomeName="hg38")
#
# # 1 tf 2 tfs 6tfs
# motifs <- c("MA0038.1", "MA0615.1", "MA0803.1")
# tbl.3 <- tfGeneSymbolForMotif(mm.default, motifs)
# checkEquals(dim(tbl.3), c(9, 2))
# checkEquals(tbl.3$motif, c("MA0038.1", rep("MA0615.1",2), rep("MA0803.1", 6)))
# checkEquals(tbl.3$geneSymbol, c("GFI1", "GMEB1", "GMEB2", "TBX15", "TBX1", "TBX10", "TBX18", "TBX20", "TBX22"))
#
#} # test_tfGeneSymbolForMotif
#----------------------------------------------------------------------------------------------------
test_getSequence <- function(indirect=FALSE)
{
if(!indirect)
printf("--- test_getSequence")
mm <- MotifMatcher(genomeName="hg38", pfms=list())
chroms <- rep("chr5", 3)
starts <- c(88819700, 88820700, 88820980)
ends <- c(88819710, 88820705, 88821000)
for(i in 1:3){
tbl.regions.noSeq <- data.frame(chrom=chroms[i], start=starts[i], end=ends[i], stringsAsFactors=FALSE)
tbl.regions <- getSequence(mm, tbl.regions.noSeq)
checkEquals(colnames(tbl.regions), c("chrom", "start", "end", "seq", "status"))
seqs <- tbl.regions$seq
checkEquals(length(seqs), 1)
checkEquals(unlist(lapply(seqs, nchar)), 1 + tbl.regions$end - tbl.regions$start)
}
# --- how to get multiple sequences
tbl.regions.noSeq <- data.frame(chrom=chroms, start=starts, end=ends, stringsAsFactors=FALSE)
x <- lapply(1:nrow(tbl.regions.noSeq), function(r) getSequence(mm, tbl.regions.noSeq[r,]))
tbl.regions <- do.call(rbind, x)
invisible(tbl.regions$seq)
} # test_getSequence
#----------------------------------------------------------------------------------------------------
test_.parseVariantString <- function()
{
if(!interactive()) return () # remove need for huge SNPlocs package
mm <- MotifMatcher(genomeName="hg38", pfms=list()) # no actual pfms needed here
tbl.variant <- trena:::.parseVariantString(mm, "rs13384219")
checkEquals(dim(tbl.variant), c(1, 4))
checkEquals(tbl.variant$chrom, "chr2")
checkEquals(tbl.variant$loc, 57907323)
checkEquals(tbl.variant$wt, "A")
checkEquals(tbl.variant$mut, "G")
# the same snp, but here expressed as a string (not an rsid)
tbl.v2 <- trena:::.parseVariantString(mm, "chr2:57907323:A:G")
checkEquals(dim(tbl.v2), c(1, 4))
checkEquals(tbl.v2$chrom, "chr2")
checkEquals(tbl.v2$loc, 57907323)
checkEquals(tbl.v2$wt, "A")
checkEquals(tbl.v2$mut, "G")
# snp with two variant alleles; one must be specified
suppressWarnings(tbl.2vars <- trena:::.parseVariantString(mm, "rs3763040:A"))
checkEquals(dim(tbl.2vars), c(1,4))
checkEquals(tbl.2vars$chrom, "chr18")
checkEquals(tbl.2vars$loc, 26864410)
checkEquals(tbl.2vars$wt, "G")
checkEquals(tbl.2vars$mut, "A")
# snp with two variant alleles; one must be specified
suppressWarnings(tbl.2vars <- trena:::.parseVariantString(mm, "rs3763040:T"))
checkEquals(dim(tbl.2vars), c(1,4))
checkEquals(tbl.2vars$chrom, "chr18")
checkEquals(tbl.2vars$loc, 26864410)
checkEquals(tbl.2vars$wt, "G")
checkEquals(tbl.2vars$mut, "T")
# snp with two variant alleles; first is chosen by default
suppressWarnings(tbl.2vars <- trena:::.parseVariantString(mm, "rs3763040"))
checkEquals(dim(tbl.2vars), c(1,4))
checkEquals(tbl.2vars$chrom, "chr18")
checkEquals(tbl.2vars$loc, 26864410)
checkEquals(tbl.2vars$wt, "G")
checkEquals(tbl.2vars$mut, "A")
# now fail on purpose, explictly specifying an alt allele not include in the snp
checkException(
tbl.2vars <- trena:::.parseVariantString(mm, "rs3763040:C"),
"caught!", silent=TRUE)
# fail again on purpose, giving a bogus rsid
failure <- try(trena:::.parseVariantString(mm, "bogus"), silent=TRUE)
checkEquals(is(failure), "try-error")
} # test_.parseVariantString
#----------------------------------------------------------------------------------------------------
test_.injectSnp <- function()
{
printf("--- test_.injectSnp")
mm <- MotifMatcher(genomeName="hg38", pfms=list())
tbl.regions.noSeq <- data.frame(chrom="chr2", start=57907318, end=57907328, stringsAsFactors=FALSE)
tbl.regions <- getSequence(mm, tbl.regions.noSeq)
checkEquals(tbl.regions$seq, "CATGCAAATTA")
tbl.variants <- data.frame(chrom="chr2", loc=57907323, wt="A", mut="G", stringsAsFactors=FALSE)
tbl.regions.new <- trena:::.injectSnp(tbl.regions, tbl.variants)
checkEquals(dim(tbl.regions.new), c(1, 5))
checkEquals(tbl.regions.new$seq, "CATGCGAATTA")
checkEquals(tbl.regions.new$status, "mut")
#checkEquals(tbl.regions.new$alt, "chr2:57907323(A->G)")
#---- a snp with two alternate alleles: rs3763040
# tbl.regions.noSeq <- data.frame(chrom="chr18", start=26864405, end=26864415, stringsAsFactors=FALSE)
# tbl.regions <- getSequence(mm, tbl.regions.noSeq)
# checkEquals(tbl.regions$seq, "ACAGGGGGGTG")
# checkEquals(tbl.regions$status, "wt")
#
# tbl.variants <- data.frame(chrom=rep("chr18",2), loc=rep(26864410,2), wt=rep("G",2) , mut=c("A", "T"),stringsAsFactors=FALSE)
# tbl.regions.new <- trena:::.injectSnp(tbl.regions, tbl.variants)
# checkEquals(dim(tbl.regions.new), c(2, 5))
# # |
# checkEquals(tbl.regions.new$seq, c("ACAGGAGGGTG",
# "ACAGGTGGGTG"))
# checkEquals(tbl.regions.new$alt, c("chr18:26864410(G->A)",
# "chr18:26864410(G->T)"))
#
# # that same snp but with a region that does not containt it
# tbl.regions <- data.frame(chrom="chr2", start=26864405, end=26864415, seq="ACAGGGGGGTG",stringsAsFactors=FALSE)
# tbl.variants <- data.frame(chrom=rep("chr18",2), loc=rep(26864410,2), wt=rep("G",2) , mut=c("A", "T"),stringsAsFactors=FALSE)
# tbl.regions.new <- trena:::.injectSnp(tbl.regions, tbl.variants)
# checkEquals(tbl.regions, tbl.regions.new)
#
# # now with one overlapping region, two non-overlapping
# tbl.regions <- data.frame(chrom="chr18",
# start=c(26864305, 26864405, 26864505),
# end= c(26864315, 26864415, 26864515),
# seq=c("CTAGCCCTTAG", "ACAGGGGGGTG","CTCTAGAGGAA"),
# stringsAsFactors=FALSE)
# tbl.variants <- data.frame(chrom=rep("chr18",2), loc=rep(26864410,2), wt=rep("G",2) , mut=c("A", "T"),stringsAsFactors=FALSE)
# tbl.regions.new <- trena:::.injectSnp(tbl.regions, tbl.variants)
# checkEquals(dim(tbl.regions.new), c(4, 5))
# checkEquals(tbl.regions.new$chrom, rep("chr18", 4))
# checkEquals(tbl.regions.new$start, c(26864305, 26864405, 26864405, 26864505))
# checkEquals(tbl.regions.new$end, c(26864315, 26864415, 26864415, 26864515))
# checkEquals(tbl.regions.new$seq, c("CTAGCCCTTAG", "ACAGGAGGGTG", "ACAGGTGGGTG", "CTCTAGAGGAA"))
# checkEquals(tbl.regions.new$alt, c("wt", "chr18:26864410(G->A)", "chr18:26864410(G->T)", "wt"))
# now try two variants in one region, different locations
mm <- MotifMatcher(genomeName="hg38", pfms=list())
tbl.variants <- data.frame(chrom=c("chr18", "chr18"),
loc=c(26865466, 26865469),
wt=c("G", "T"),
mut=c("C", "C"),
stringsAsFactors=FALSE)
tbl.regions.noSeq <- data.frame(chrom="chr18", start=26865463, end=26865472, stringsAsFactors=FALSE)
tbl.regions <- getSequence(mm, tbl.regions.noSeq)
tbl.regions.new <- trena:::.injectSnp(tbl.regions, tbl.variants)
# | |
checkEquals(tbl.regions$seq, "AAAGCATCCC")
checkEquals(tbl.regions.new$seq, "AAACCACCCC")
} # test_.injectSnp
#----------------------------------------------------------------------------------------------------
# rs13384219 A->G
# gtcagtagtggtggaaccagcatgc[A/G]aattagacaatgtgacttcatagcc
# Chromosome: 2:57907323
# chr2:57907323:A:G
test_getSequenceWithVariants <- function()
{
if(!interactive()) return()
printf("--- test_getSequenceWithVariants")
mm <- MotifMatcher(genomeName="hg38", pfms=list())
chroms <- "chr2"
starts <- 57907318
ends <- 57907328
tbl.regions.noSeq <- data.frame(chrom=chroms, start=starts, end=ends, stringsAsFactors=FALSE)
tbl.wt <- getSequence(mm, tbl.regions.noSeq)
tbl.mut <- getSequence(mm, tbl.regions.noSeq, "rs13384219")
checkEquals(tbl.wt$status, "wt")
checkEquals(tbl.mut$status, "mut") #"chr2:57907323(A->G)")
checkEquals(tbl.wt$seq, "CATGCAAATTA")
checkEquals(tbl.mut$seq, "CATGCGAATTA")
# now a snp with two variants
chromosome <- "chr18"
loc <- 26864410
tbl.regions.noSeq <- data.frame(chrom=chromosome, start=loc-5, end=loc+5, stringsAsFactors=FALSE)
rsid <- "rs3763040"
tbl.wt <- getSequence(mm, tbl.regions.noSeq)
checkEquals(dim(tbl.wt), c(1, 5))
checkEquals(tbl.wt$seq, "ACAGGGGGGTG")
checkEquals(tbl.wt$status, "wt")
suppressWarnings(tbl.mut <- getSequence(mm, tbl.regions.noSeq, rsid))
checkEquals(dim(tbl.mut), c(1, 5))
checkEquals(tbl.mut$seq, "ACAGGAGGGTG")
checkEquals(tbl.mut$status, "mut") # c("chr18:26864410(G->A)", "chr18:26864410(G->T)"))
tbl.mut <- getSequence(mm, tbl.regions.noSeq, "rs3763040:A")
checkEquals(dim(tbl.mut), c(1, 5))
checkEquals(tbl.mut$seq, "ACAGGAGGGTG")
checkEquals(tbl.mut$status, "mut") # c("chr18:26864410(G->A)", "chr18:26864410(G->T)"))
tbl.mut <- getSequence(mm, tbl.regions.noSeq, "rs3763040:T")
checkEquals(dim(tbl.mut), c(1, 5))
checkEquals(tbl.mut$seq, "ACAGGTGGGTG")
checkEquals(tbl.mut$status, "mut") # c("chr18:26864410(G->A)", "chr18:26864410(G->T)"))
# now provide two variants (at two locations), here just three bases apart
# rs750694782: chr18:26865466
# rs3875089: chr18:26865469
variants <- c("rs750694782", "rs3875089")
tbl.regions.noSeq <- data.frame(chrom="chr18", start=26865463, end=26865472, stringsAsFactors=FALSE)
tbl.wt <- getSequence(mm, tbl.regions.noSeq)
checkEquals(as.list(tbl.wt[, c("seq", "status")]), list(seq="AAAGCATCCC", status="wt"))
tbl.mut <- getSequence(mm, tbl.regions.noSeq, variants) # | |
checkEquals(as.list(tbl.mut[, c("seq", "status")]), list(seq="AAACCACCCC", status="mut"))
} # test_getSequenceWithVariants
#----------------------------------------------------------------------------------------------------
test_noMatch <- function()
{
printf("--- test_noMatch")
jaspar.human.pfms <- as.list(query (query(MotifDb, "sapiens"), "jaspar2016"))
motifMatcher <- MotifMatcher(genomeName="hg38", pfms=jaspar.human.pfms, quiet=TRUE)
tbl.regions <- data.frame(chrom="chr2", start=57907313, end=57907314, stringsAsFactors=FALSE)
tbl.hits <- findMatchesByChromosomalRegion(motifMatcher, tbl.regions, pwmMatchMinimumAsPercentage=92)
checkEquals(dim(tbl.hits), c(0,0))
} # test_noMatch
#----------------------------------------------------------------------------------------------------
test_findMatchesByChromosomalRegion <- function()
{
printf("--- test_findMatchesByChromosomalRegion")
jaspar.human.pfms <- as.list(query (query(MotifDb, "sapiens"), "jaspar2016"))
motifMatcher <- MotifMatcher(genomeName="hg38", pfms=jaspar.human.pfms, quiet=TRUE)
tbl.regions <- data.frame(chrom="chr2", start=57907313, end=57907333, stringsAsFactors=FALSE)
tbl.hits <- findMatchesByChromosomalRegion(motifMatcher, tbl.regions, pwmMatchMinimumAsPercentage=92)
checkEquals(dim(tbl.hits), c(5, 13))
checkEquals(unique(tbl.hits$status), "wt")
# the best match (longest, all bases in agreement with the motif:
# http://jaspar.genereg.net/cgi-bin/jaspar_db.pl?rm=present&collection=CORE&ID=MA0792.1
best <- as.list(tbl.hits[1,])
checkEquals(best$motifName, "Hsapiens-jaspar2016-POU5F1B-MA0792.1")
checkEquals(best$chrom, "chr2")
checkEquals(best$motifStart, 57907318)
checkEquals(best$motifEnd, 57907326)
checkEquals(best$strand, "+")
checkEqualsNumeric(best$motifScore, 6.707832)
checkEqualsNumeric(best$motifRelativeScore, 0.9208761)
checkEquals(best$match, "CATGCAAAT")
checkEquals(best$chromStart, 57907313)
checkEquals(best$chromEnd, 57907333)
checkEquals(best$seq, "ACCAGCATGCAAATTAG...")
checkEquals(best$status, "wt")
} # test_findMatchesByChromosomalRegion
#----------------------------------------------------------------------------------------------------
test_findMatchesByChromosomalRegion_contrastReferenceWithVariant <- function()
{
if(!interactive()) return()
printf("--- test_findMatchesByChromosomalRegion_contrastReferenceWithVariant")
# the vrk2 promoter snp, chr2:57907313-57907333
jaspar.human.pfms <- as.list(query (query(MotifDb, "sapiens"), "jaspar2016"))
motifMatcher <- MotifMatcher(genomeName="hg38", pfms=jaspar.human.pfms, quiet=TRUE)
tbl.regions <- data.frame(chrom="chr2", start=57907313, end=57907333, stringsAsFactors=FALSE)
tbl.wt <- findMatchesByChromosomalRegion(motifMatcher, tbl.regions, pwmMatchMinimumAsPercentage=92)
tbl.mut <- findMatchesByChromosomalRegion(motifMatcher, tbl.regions, pwmMatchMinimumAsPercentage=92, "rs13384219")
tbl <- rbind(tbl.wt[, c(1,12, 2,3,4,5,7,8)], tbl.mut[, c(1,12, 2,3,4,5,7,8)])
tbl <- tbl[order(tbl$motifName, tbl$motifRelativeScore, decreasing=TRUE),]
motifs.in.both <- tbl$motifName[which(duplicated(tbl$motifName))]
tbl.lost.in.mut <- subset(tbl, !motifName %in% motifs.in.both)
# two wt motifs lost, not found in mut:
#
motifs.only.in.wt <- c("Hsapiens-jaspar2016-POU5F1B-MA0792.1", "Hsapiens-jaspar2016-LHX9-MA0701.1")
checkTrue(all(motifs.only.in.wt %in% tbl.lost.in.mut$motifName))
checkEquals(unique(tbl.lost.in.mut$status), "wt")
# now try again with a more permissive matching threshold
tbl.wt <- findMatchesByChromosomalRegion(motifMatcher, tbl.regions, pwmMatchMinimumAsPercentage=80)
tbl.mut <- findMatchesByChromosomalRegion(motifMatcher, tbl.regions, pwmMatchMinimumAsPercentage=80, "rs13384219")
tbl <- rbind(tbl.wt[, c(1,12, 2,3,4,5,7,8)], tbl.mut[, c(1,12, 2,3,4,5,7,8)])
tbl <- tbl[order(tbl$motifName, tbl$motifRelativeScore, decreasing=TRUE),]
# the POU5F1B motif MA0792.1 drops from a 92% to an 83% match.
# assessing the relevance of this will be up to the biologist...
checkEqualsNumeric(subset(tbl, motifName=="Hsapiens-jaspar2016-POU5F1B-MA0792.1")$motifRelativeScore, c(0.92, 0.83), tol=1e-2)
motifs.in.both <- tbl$motifName[which(duplicated(tbl$motifName))]
tbl.summary <- subset(tbl, !motifName %in% motifs.in.both)
# two possible new binding motifs: MA0891.1, MA0648.1
tbl.deNovoMut <- subset(tbl.summary, motifRelativeScore > 0.85)
checkEquals(sort(tbl.deNovoMut$motifName),
c("Hsapiens-jaspar2016-GSC-MA0648.1", "Hsapiens-jaspar2016-GSC2-MA0891.1"))
checkEquals(unique(tbl.deNovoMut$status), "mut")
} # test_findMatchesByChromosomalRegion_contrastReferenceWithVariant
#----------------------------------------------------------------------------------------------------
test_findMatchesByMultipleChromosomalRegions <- function()
{
printf("--- test_findMatchesByMultipleChromosomalRegions")
# 442 explicitly human pfms from the JASPAR2016.
# as.list carves out just the matrices (pfms) omitting metadata
pfms <- as.list(query(query(MotifDb, "jaspar2016"), "sapiens"))
motifMatcher <- MotifMatcher(genomeName="hg38", pfms)
# 21 bases on chr2, 21 bases on chr18
tbl.regions <- data.frame(chrom=c("chr2", "chr18"),
start=c(57907313, 26864400),
end =c(57907333, 26864420),
stringsAsFactors=FALSE)
tbl.hits <- findMatchesByChromosomalRegion(motifMatcher, tbl.regions, pwmMatchMinimumAsPercentage=93)
m1 <- as.list(tbl.hits[1,])
checkEquals(m1$motifName, "Hsapiens-jaspar2016-SHOX-MA0630.1")
checkEquals(m1$chrom, "chr2")
checkEquals(m1$motifStart, 57907322)
checkEquals(m1$motifEnd, 57907329)
checkEquals(m1$strand, "-")
checkEqualsNumeric(m1$motifScore, 5.23, tol=1e-2)
checkEqualsNumeric(m1$motifRelativeScore, 0.93, tol=1e-2)
checkEquals(m1$match, "CAAATTAG")
checkEquals(m1$chromStart, 57907313)
checkEquals(m1$chromEnd, 57907333)
checkEquals(m1$seq, "ACCAGCATGCAAATTAG...")
checkEquals(m1$status, "wt")
m2 <- as.list(tbl.hits[2,])
checkEquals(m2$motifName, "Hsapiens-jaspar2016-ZNF354C-MA0130.1")
checkEquals(m2$chrom, "chr18")
checkEquals(m2$motifStart, 26864401)
checkEquals(m2$motifEnd, 26864406)
checkEquals(m2$strand, "+")
checkEquals(m2$motifScore, 5, tol=1e-2)
checkEquals(m2$motifRelativeScore, 0.98, tol=1e-2)
checkEquals(m2$match, "CTCCAC")
checkEquals(m2$chromStart, 26864400)
checkEquals(m2$chromEnd, 26864420)
checkEquals(m2$seq, "GCTCCACAGGGGGGTGG...")
checkEquals(m2$status, "wt")
# now repeat with a looser match threshold
tbl.hits2 <- findMatchesByChromosomalRegion(motifMatcher, tbl.regions, pwmMatchMinimumAsPercentage=90)
tbl.freq <- as.data.frame(table(tbl.hits2$chrom))
# one match each found for these two regions at the stringent match threshold
checkEquals(dim(tbl.hits2), c(18, 13))
# two motifs from chr18, the rest from chr2
checkEquals(nrow(subset(tbl.hits2, chrom=="chr18")), 1)
checkEquals(nrow(subset(tbl.hits2, chrom=="chr2")), 17)
} # test_findMatchesByMultipleChromosomalRegions
#----------------------------------------------------------------------------------------------------
test_findMatchesByChromosomalRegion.twoAlternateAlleles <- function()
{
printf("--- test_findMatchesByChromosomalRegion.twoAlternateAlleles")
# rsid <- "rs3763040" # 18:26864410 A/C/T (T/G/A)
# getSequence(mm, data.frame(chrom="chr18", start=26864410, end=26864410))[1] "G"
# |
# chr18:26864405-26864415 ACAGGGGGGTG
# thus variants must be (wrt to + strand) T and A
# target.gene <- "APQR"
# genome <- "hg38"
chromosome <- "chr18"
loc <- 26864410
jaspar.human.pfms <- as.list(query (query(MotifDb, "sapiens"), "jaspar2016"))
mm <- MotifMatcher(genomeName="hg38", pfms=jaspar.human.pfms)
tbl.regions <- data.frame(chrom=chromosome, start=loc-6, end=loc+3, stringsAsFactors=FALSE)
tbl.wt <- findMatchesByChromosomalRegion(mm, tbl.regions, pwmMatchMinimumAsPercentage=80L)
tbl.mut <- findMatchesByChromosomalRegion(mm, tbl.regions, pwmMatchMinimumAsPercentage=80L, "chr18:26864410:G:T")
checkEquals(unique(tbl.wt$seq), "CACAGGGGGG")
checkEquals(unique(tbl.mut$seq), "CACAGGTGGG")
# two motifs are shared
motifs.shared <- sort(intersect(tbl.wt$motifName, tbl.mut$motifName))
checkEquals(motifs.shared, c("Hsapiens-jaspar2016-TCF3-MA0522.2", "Hsapiens-jaspar2016-TCF4-MA0830.1"))
# the snp increases the scores of these two
checkEqualsNumeric(subset(tbl.wt, motifName %in% motifs.shared[1])$motifScore, 5.79, tol=1e-2)
checkEqualsNumeric(subset(tbl.mut, motifName %in% motifs.shared[1])$motifScore, 6.79, tol=1e-2)
# MA0522.2, TCF3, class: bHLH, family: E2A-related factors, reverse complement, from JASPAR
# relative score motif score
# wt: C A C A G G G G G G 0.8016 5.79
# mut: C A C A G G T G G G 0.9394 6.79
# A [2247 2554 5 7799 0 317 19 0 213 1839 ]
# C [2093 1978 7799 7 465 1251 0 0 2842 1014 ]
# G [1393 3014 21 35 7799 7799 5 7799 1321 2123 ]
# T [2066 253 27 43 0 0 7799 0 3423 2823 ]
checkEqualsNumeric(subset(tbl.wt, motifName %in% motifs.shared[2])$motifScore, 5.89, tol=1e-2)
checkEqualsNumeric(subset(tbl.mut, motifName %in% motifs.shared[2])$motifScore, 6.89, tol=1e-2)
# MA0830.1, TCF4, class: bHLH, family: E2A-related factors, reverse complement, from JASPAR
# relative score motif score
# wt: C A C A G G G G G G 0.8047 5.89
# mut: C A C A G G T G G G 0.9412 6.89
# A [ 5380 6301 25 20335 0 1334 0 13 409 5185 ]
# C [ 4838 6324 20335 0 387 2315 0 0 9640 2128 ]
# G [ 4812 7336 5 0 20335 20335 0 20335 2465 8284 ]
# T [ 5305 374 27 117 56 0 20335 6 7822 4738 ]
# motifs lost and gained
motifs.lost <- sort(setdiff(tbl.wt$motifName, tbl.mut$motifName))
motifs.gained <- sort(setdiff(tbl.mut$motifName, tbl.wt$motifName))
checkEquals(motifs.lost, "Hsapiens-jaspar2016-MZF1-MA0056.1")
# MA0056.1, MZF1, class: C2H@ zinc finger factors, family: more than 3 adjacent zinc finger factors, from JASPAR
# relative score motif score
# wt: G G G G G G 0.8039 4.10
# A [ 3 0 2 0 0 18 ]
# C [ 5 0 0 0 0 0 ]
# G [ 4 19 18 19 20 2 ]
# T [ 8 1 0 1 0 0 ]
checkEquals(motifs.gained, c("Hsapiens-jaspar2016-FIGLA-MA0820.1",
"Hsapiens-jaspar2016-ID4-MA0824.1" ,
"Hsapiens-jaspar2016-SNAI2-MA0745.1",
"Hsapiens-jaspar2016-USF1-MA0093.1"))
# look at (for now( just one of the gain motifs
# MA0745.1, SNAi2, class C2H2 zinc finger factors, more than 3 adjacent zinc finger factors
# relative score motif score
# mut: C A C A G G T G G 0.9516795 6.199955
# A [ 37982 56979 5150 110706 16150 361 57 16003 9858 ]
# C [ 15746 4756 110706 1908 1458 0 525 5615 37564 ]
# G [ 35246 53727 2678 784 110706 110706 0 110706 25335 ]
# T [ 21733 16594 4022 3454 3864 323 110706 21327 37949 ]
checkEquals(sort(unique(tbl.mut$motifName)),
c("Hsapiens-jaspar2016-FIGLA-MA0820.1",
"Hsapiens-jaspar2016-ID4-MA0824.1" ,
"Hsapiens-jaspar2016-SNAI2-MA0745.1",
"Hsapiens-jaspar2016-TCF3-MA0522.2" ,
"Hsapiens-jaspar2016-TCF4-MA0830.1" ,
"Hsapiens-jaspar2016-USF1-MA0093.1"))
# two low-scoring ~80% matches to the sequence with C substituted
# |
# CACAGGTGGG
checkEquals(length(grep("CAGGTG", tbl.mut$match)), nrow(tbl.mut))
# now substitute, not G->T, but G->A
# this results in the loss of the GGGGGG motif from the wildtype, but the consequences
# for the other two wt motifs are undetectable in the scores: both wt and this mutant
# provide a poor match to the T expected at that position
tbl <- findMatchesByChromosomalRegion(mm, tbl.regions, pwmMatchMinimumAsPercentage=80L, "chr18:26864410:G:A")
checkEqualsNumeric(tbl$motifRelativeScore[1], 0.805, tol=10e-2)
checkEqualsNumeric(tbl$motifRelativeScore[2], 0.802, tol=10e-2)
# matches to the core of the sequence with T replaced by A
# |
# AGGAGG
# all four
checkEquals(length(grep("AGGAGG", tbl$match)), nrow(tbl))
} # test_findMatchesByChromosomalRegion.twoAlternateAlleles
#----------------------------------------------------------------------------------------------------
test_findMatchesByChromosomalRegion.yeast <- function()
{
printf("--- test_findMatchesByChromosomalRegion.yeast")
pfms <- as.list(jaspar.yeast.pfms <- query(MotifDb, c("cerevisiae", "jaspar2018")))
mm <- MotifMatcher(genomeName="saccer3", pfms)
tbl.regions <- data.frame(chrom="chrI", start=71287, end=71886, stringsAsFactors=FALSE)
tbl.motifs <- findMatchesByChromosomalRegion(mm, tbl.regions, pwmMatchMinimumAsPercentage=95L)
dim(tbl.motifs)
checkTrue(nrow(tbl.motifs) > 20 & nrow(tbl.motifs) < 30)
checkEquals(ncol(tbl.motifs), 13)
tfs <- sort(mcols(MotifDb[unique(tbl.motifs$motifName)])$geneSymbol)
some.expected <- c("ABF2", "ACE2", "ARG80", "FZF1", "HAL9", "HAP2")
checkTrue(all(some.expected %in% tfs))
} # test_findMatchesByChromosomalRegion.yeast
#----------------------------------------------------------------------------------------------------
test_findMatchesByChromosomalRegion.athaliana <- function()
{
printf("--- test_findMatchesByChromosomalRegion.athaliana")
pfms <- as.list(query(MotifDb, c("athaliana", "jaspar2018")))
mm <- MotifMatcher(genomeName="tair10", pfms)
tbl.regions <- data.frame(chrom="Chr3", start=20438600, end=20438640, stringsAsFactors=FALSE)
seq <- getSequence(mm, tbl.regions)
# chrom start end seq status
# Chr3 20438539 20438639 CTAAAAATAGGAGAAATAAGTGCAATTTCAAAGGAGACGTTAATGGCAAGTTTTTCTCTTCCAAAATTGACTAGTCGATGATGTTAATTGTTAATGTTGTC wt
tbl.motifs <- findMatchesByChromosomalRegion(mm, tbl.regions, pwmMatchMinimumAsPercentage=90L)
dim(tbl.motifs)
checkTrue(nrow(tbl.motifs) > 5 & nrow(tbl.motifs) < 10)
checkEquals(ncol(tbl.motifs), 13)
tfs <- sort(mcols(MotifDb[unique(tbl.motifs$motifName)])$geneSymbol)
some.expected <- c("AGL42", "WRKY40","WRKY48", "WRKY75", "WRKY8")
checkTrue(all(some.expected %in% tfs))
} # test_findMatchesByChromosomalRegion.athaliana
#----------------------------------------------------------------------------------------------------
# after finding and fixing and testing the start/end calculation for minus strand hits
# in .matchPwmForwardAndReverse, this exploratory function now tests that those results
# truly show up in higher level calling functions
test_bugInStartEndOfMinusStrandHits <- function()
{
printf("--- test_bugInStartEndOfMinusStrandHits")
tbl.regions.a <- data.frame(chrom="chr5", start=134115542, end=134115562, stringsAsFactors=FALSE)
tbl.regions.b <- data.frame(chrom="chr5", start=134115542, end=134115572, stringsAsFactors=FALSE)
pfms <- as.list(query(query(MotifDb, "sapiens"), "jaspar2016"))
mm <- MotifMatcher(genomeName="hg38", pfms=pfms)
tbl.a <- findMatchesByChromosomalRegion(mm, tbl.regions.a, pwmMatchMinimumAsPercentage=94)
tbl.b <- findMatchesByChromosomalRegion(mm, tbl.regions.b, pwmMatchMinimumAsPercentage=94)
# tbl.b should have everything in tbl.a
matches <- grep(tbl.a$motifName, tbl.b$motifName)
checkEquals(length(matches), nrow(tbl.a))
otherHit.indices <- setdiff(1:nrow(tbl.b), matches)
# all the other (new) matches, found in tbl.b but not tbl.a, must end AFTER the end tbl.regions.a
checkTrue(all(tbl.b$motifEnd[otherHit.indices] > tbl.regions.a$end))
# not likely -- unless the motif is VERY wide - that any of these new matches in tbl.b
# start before the single match in tbl.a
checkTrue(all(tbl.b$motifStart[otherHit.indices] > tbl.regions.a$motifStart))
} # test_bugInStartEndOfMinusStrandHits
#----------------------------------------------------------------------------------------------------
if(!interactive())
runTests()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.