bulk/chr19-demo/fimoBatchTools.R

library(RUnit)
library(GenomicRanges)
library(org.Hs.eg.db)
library(BSgenome.Hsapiens.UCSC.hg38)
library(BSgenome.Hsapiens.UCSC.hg19)
library(Biostrings)
library(MotifDb)
#------------------------------------------------------------------------------------------------------------------------
runTests <- function()
{
  test_createFastaFileForFimo()
  test_runFimo()
  test_fixMotifNamesTruncatedAt100characters()
  test_expandFimoTable()
  test_fimoBatch()

} # runTests
#------------------------------------------------------------------------------------------------------------------------
createFastaFileForFimo <- function(tbl.regions, fastaFileName, genome)
{
   stopifnot(genome %in% c("hg38", "hg19"))

   if(genome == "hg38")
     sequences <- with(tbl.regions, getSeq(BSgenome.Hsapiens.UCSC.hg38, chrom, start, end))
   if(genome == "hg19")
     sequences <- with(tbl.regions, getSeq(BSgenome.Hsapiens.UCSC.hg19, chrom, start, end))

   if(is(sequences, "DNAString"))
       sequences <- DNAStringSet(sequences)
   names(sequences) <- with(tbl.regions, sprintf("%s:%d-%d", chrom, start, end))
   message(sprintf("creating fasta file '%s' for %d sequences, for FIMO", fastaFileName, length(sequences)))

   writeXStringSet(sequences, fastaFileName)

} # createFastaFileForFimo
#------------------------------------------------------------------------------------------------------------------------
test_createFastaFileForFimo <- function()
{
   message(noquote(sprintf("--- test_createFastaFileForFimo")))

     # a < 1kb region in the promoter of GATA2, where TBX15 hits may be found
   tbl.regions <- data.frame(chrom="chr3", start=128497569, end=128498329, stringsAsFactors=FALSE)
   createFastaFileForFimo(tbl.regions, "smallTest.fa", "hg38")
   checkTrue(file.exists("smallTest.fa"))
   x <- readDNAStringSet("smallTest.fa")
   checkEquals(length(x), 1)

   tbl.regions.2 <- data.frame(chrom=c("chr3", "chr4"),
                               start=c(128497569, 128498569),
                               end=  c(128498329, 128498589),
                               stringsAsFactors=FALSE)
   createFastaFileForFimo(tbl.regions.2, "smallTest2.fa", "hg19")
   x <- readDNAStringSet("smallTest2.fa")
   checkEquals(length(x), 2)

} # test_createFastaFileForFimo
#------------------------------------------------------------------------------------------------------------------------
runFimo <- function(fastaFileName, resultsDirectory, threshold=1e-4,
                    pwmFile="~/github/fimoService/pfms/human-jaspar2018-hocomoco-swissregulon.meme")
{
   # printf("--- running FIMO")
   FIMO <- file.path(Sys.getenv("HOME"), "meme", "bin", "fimo")  # true on both hagfish & khaleesi
   MOTIFS <- pwmFile
   #cmd <- sprintf("%s --oc %s --thresh -%f --text --verbosity 1 %s %s",
   #               FIMO, resultsDirectory, threshold, MOTIFS, fastaFileName)

   base.name <- basename(fastaFileName)
   tsv.name <- sub(".fa", ".tsv", base.name, fixed=TRUE)
   tsv.path <- file.path(resultsDirectory, tsv.name)

   cmd <- sprintf("%s --thresh %f --verbosity 1 --text %s %s > %s",
                  FIMO, threshold, MOTIFS, fastaFileName, tsv.path)

   print(cmd)
   system(cmd)
   return(tsv.path)
   # /users/pshannon/meme/bin/fimo --oc . --thresh 1e-4 ~/github/fimoService/pfms/human-jaspar2018-hocomoco-swissregulon.meme chr11-small.fa

} # runFimo
#------------------------------------------------------------------------------------------------------------------------
test_runFimo <- function()
{
   message(noquote(sprintf("--- test_runFimo")))

   fastaFileName <- "smallTest.fa"  # created in test_createFastaFileforFimo
   checkTrue(file.exists(fastaFileName))
   resultsDirectory <- tempdir()
   resultsFile <- runFimo(fastaFileName, resultsDirectory, threshold=1e-2)
   checkTrue(file.exists(resultsFile))

} # test_runFimo
#------------------------------------------------------------------------------------------------------------------------
runFimoGATA2.big <- function()
{
      # GATA2 and the full span of all enhancers
   tbl.regions <- data.frame(chrom="chr3", start=128013674, end=128712841, stringsAsFactors=FALSE)
      # only 263kb around GATA2's enhancers
   tbl.regions <- data.frame(chrom="chr3", start=128383794, end=128647775, stringsAsFactors=FALSE)
     # 1kb region for faster testing
   start.loc <- 128383794
   end.loc <- start.loc + 1000
   tbl.regions <- data.frame(chrom="chr3", start=start.loc, end=end.loc, stringsAsFactors=FALSE)
   with(tbl.regions, printf("span: %d", end-start))

   resultsDirectory <- "tmp-fimo-out-3"

   if(!dir.exists(resultsDirectory))
     dir.create (resultsDirectory)

   fastaFilename <- file.path(resultsDirectory, "test.fa")
   createFastaFileForFimo(tbl.regions, fastaFilename, "hg38")
   checkTrue(file.exists(fastaFilename))
   checkTrue(file.size(fastaFilename) > with(tbl.regions, end-start))  # bigger than the base count


      #  4 minutes for 1e-4, 263k    201090 lines
      #  5 minutes for 1e-3, 263k   1510766 lnes
      # 18 minutes for 1e-2, 263k  12033436 lines

   system.time(runFimo(fastaFilename, resultsDirectory, threshold=1e-5))
   fimo.results.file <- file.path(resultsDirectory, "test.tsv")
   checkTrue(file.exists(fimo.results.file))

   tbl <- read.table(fimo.results.file, sep="\t", as.is=TRUE, nrow=-1, header=TRUE)  # two chopped names
   tbl.fixed <- fixMotifNamesTruncatedAt100characters(tbl)
   fimo.results.file.fixed <- file.path(resultsDirectory, "fimo-fixed.tsv")
   write.table(tbl.fixed, fimo.results.file.fixed, sep="\t", quote=FALSE, row.names=FALSE)

} # runFimoGATA2.big
#------------------------------------------------------------------------------------------------------------------------
fixMotifNamesTruncatedAt100characters <- function(tbl)
{
   char.100.lines <- which(nchar(tbl$motif_id) == 100)
   printf("found %d 100 character lines", length(char.100.lines))
   if(length(char.100.lines) == 0)
      invisible(tbl)

   motifDb.indices <- lapply(tbl$motif_id[char.100.lines], function(id) grep(id, names(MotifDb)))
   motifDb.names <- names(MotifDb)[as.integer(motifDb.indices)]
   tbl$motif_id[char.100.lines] <- motifDb.names

   invisible(tbl)

} # fixMotifNamesTruncatedAt100characters
#------------------------------------------------------------------------------------------------------------------------
test_fixMotifNamesTruncatedAt100characters <- function()
{
   printf("--- test_fixMotifNamesTruncatedAt100characters")
   #  a couple of instances of one long motif name:
   #  128 Mmusculus;Rnorvegicus;Xlaevis;Stropicalis;Ggallus;Hsapiens;Btaurus;Ocuniculus-jaspar2018-NFYA-MA0060           NA chr3:128383794-128647775 104174 104189      + 16.1461 1.29e-06      NA GCCAGCCAATCAACGC
   #  129 Mmusculus;Rnorvegicus;Xlaevis;Stropicalis;Ggallus;Hsapiens;Btaurus;Ocuniculus-jaspar2018-NFYA-MA0060           NA chr3:128383794-128647775 104210 104225      - 16.9888 3.79e-07      NA CGCAGCCAATGGGAGG

   start.loc <- 128383794 + 104170
   end.loc <- start.loc + 30
   tbl.regions <- data.frame(chrom="chr3", start=start.loc, end=end.loc, stringsAsFactors=FALSE)

   resultsDirectory <- "tmp-fimo-out-3"

   if(!dir.exists(resultsDirectory))
     dir.create (resultsDirectory)

   fastaFilename <- file.path(resultsDirectory, "test.fa")
   createFastaFileForFimo(tbl.regions, fastaFilename, "hg38")
   checkTrue(file.exists(fastaFilename))
   checkTrue(file.size(fastaFilename) > with(tbl.regions, end-start))  # bigger than the base count


      #  4 minutes for 1e-4, 263k    201090 lines
      #  5 minutes for 1e-3, 263k   1510766 lnes
      # 18 minutes for 1e-2, 263k  12033436 lines

   system.time(runFimo(fastaFilename, resultsDirectory, threshold=1e-5))
   fimo.results.file <- file.path(resultsDirectory, "test.tsv")
   checkTrue(file.exists(fimo.results.file))

   tbl <- read.table(fimo.results.file, sep="\t", as.is=TRUE, nrow=-1, header=TRUE)  # two chopped names
      # before fimo 5.0.5,  motif_id names were truncated at 100 characers.  no longer a problem (27 sep 2019)
   checkTrue(max(nchar(tbl$motif_id)) > 100)
     #------------------------------------------------------------
     # tests and fix no longer needed:
     #------------------------------------------------------------
     #  checkEquals(tbl$motif_id[1],
     #              "Mmusculus;Rnorvegicus;Xlaevis;Stropicalis;Ggallus;Hsapiens;Btaurus;Ocuniculus-jaspar2018-NFYA-MA0060")
     #  tbl.fixed <- fixMotifNamesTruncatedAt100characters(tbl)
     #  checkEquals(tbl.fixed$motif_id[1],
     #              "Mmusculus;Rnorvegicus;Xlaevis;Stropicalis;Ggallus;Hsapiens;Btaurus;Ocuniculus-jaspar2018-NFYA-MA0060.1")


} # test_fixMotifNamesTruncatedAt100characters
#------------------------------------------------------------------------------------------------------------------------
parseLocStrings <- function(locStrings)
{
   match <- regexpr("(?<chromosome>chr.*):(?<startPos>\\d+)-(?<endPos>\\d+)", locStrings, perl=TRUE)
   match.count <- length(match)
   if(match.count != length(locStrings)){
      stop("fimoBatchTools.parseLocStrings encountered unexpected sequence_name: %s", locStrings[1])
      }

   columns <-     attr(match, "capture.names")
   starts <-  as.data.frame(attr(match, "capture.start"))
   lengths <- as.data.frame(attr(match, "capture.length"))
   chroms <- substring(locStrings, starts$chromosome, starts$chromosome + lengths$chromosome -1)
   start.locs <- as.integer(substring(locStrings, starts$startPos, starts$startPos + lengths$startPos -1))
   end.locs <- as.integer(substring(locStrings, starts$endPos, starts$endPos + lengths$endPos -1))
   data.frame(chrom=chroms, start=start.locs, end=end.locs, stringsAsFactors=FALSE)

} # parseLocStrings
#------------------------------------------------------------------------------------------------------------------------
# add chrom coloumn.  adjust starts and stops.  rearrange columns
expandFimoTable <- function(tbl)
{
     # our convention is that fimo is called with a sequence_name which is the chromloc of the target sequence
     # here we depend upon that in order to add concrete chrom, start, end  locations to each match

   stopifnot("sequence_name" %in% colnames(tbl))  # this is

   tbl.locs <- parseLocStrings(tbl$sequence_name)
   number.of.distinct.chromosomes <- length(unique(tbl.locs$chrom))
   if(number.of.distinct.chromosomes > 1){
      stop("fimoBatchTools.expandFimoTable found %d chromosomes in table, only one permited",
           number.of.distinct.chromosomes)
      }
   tbl$chrom <- tbl.locs$chrom
   tbl$start <- tbl$start + tbl.locs$start
   tbl$end   <- tbl$stop + tbl.locs$start

   tfs <- unlist(lapply(tbl$motif_id, function(motifName) mcols(MotifDb[motifName])$geneSymbol))

   tbl$tf <- tfs
   coi <- c("chrom", "start", "end", "tf", "strand", "score", "p.value", "matched_sequence", "motif_id")
   tbl.final <- tbl[, coi]
   new.order <- order(tbl.final$start, decreasing=FALSE)
   tbl.final <- tbl.final[new.order,]

} # expandFimoTable
#------------------------------------------------------------------------------------------------------------------------
test_expandFimoTable <- function(tbl)
{
   printf("--- test_expandFimoTable")
   tbl.directFromFimo <- get(load("tbl.fimoBeforeColumnsFixes.RData"))
   tbl <- expandFimoTable(tbl.directFromFimo)

   checkEquals(nrow(tbl.directFromFimo), nrow(tbl))
   checkEquals(nrow(unique(tbl.directFromFimo)), nrow(tbl))
   checkTrue(all(tbl$chrom == "chr3"))
   checkEquals(sort(unique(tbl$tf)), c("CEBPZ", "FOXI1", "NFYA", "NFYB", "NFYC", "PBX3", "ZBTB14"))
   checkEquals(colnames(tbl), c("chrom", "start", "end", "tf", "strand", "score", "p.value", "matched_sequence", "motif_id"))

} # test_expandFimoTable
#------------------------------------------------------------------------------------------------------------------------
fimoBatch <- function(tbl.regions, matchThreshold, genomeName, pwmFile)
{
   resultsDirectory <- tempdir()

   if(!dir.exists(resultsDirectory))
     dir.create (resultsDirectory)

   fastaFilename <- file.path(resultsDirectory, "forFimo.fa")
   createFastaFileForFimo(tbl.regions, fastaFilename, genomeName)

   system.time(runFimo(fastaFilename, resultsDirectory, threshold=matchThreshold, pwmFile=pwmFile))
   fimo.results.file <- file.path(resultsDirectory, "forFimo.tsv")

   tbl.expanded <- data.frame()  # in case nothing is found

   if(file.exists(fimo.results.file)){
      if(file.size(fimo.results.file) > 0){
         tbl <- read.table(fimo.results.file, sep="\t", as.is=TRUE, nrow=-1, header=TRUE)  # two chopped names
         #tbl.fixed <- fixMotifNamesTruncatedAt100characters(tbl)
         tbl.expanded <- expandFimoTable(tbl)
         } # if non-zero size
      } # if results file exiss

   invisible(tbl.expanded)

} # fimoBatch
#------------------------------------------------------------------------------------------------------------------------
test_fimoBatch <- function()
{
   printf("--- test_fimoBatch")
   chomosome <- "chr3"
   start.loc <- 128383794 + 104170
   start.loc.2 <- 28000000
   end.loc <- start.loc + 30
   end.loc.2 <- start.loc.2 + 1000
   tbl.regions <- data.frame(chrom="chr3", start=start.loc, end=end.loc, stringsAsFactors=FALSE)
   tbl.match <- fimoBatch(tbl.regions, 1e-5, genomeName="hg38",
                          pwmFile="~/github/fimoService/pfms/human-jaspar2018-hocomoco-swissregulon.meme")
   checkEquals(dim(tbl.match), c(13, 9))

     # now send two regions
   tbl.regions.2 <- data.frame(chrom=c("chr3", "chr3"),
                               start=c(start.loc, start.loc.2),
                               end=c(end.loc,     end.loc.2),
                               stringsAsFactors=FALSE)
   tbl.match.2 <- fimoBatch(tbl.regions.2, 1e-6, genomeName="hg38",
                            pwmFile="~/github/fimoService/pfms/human-jaspar2018-hocomoco-swissregulon.meme")
   checkEquals(dim(tbl.match.2), c(7, 9))
     # make sure that both regions were used
   checkEquals(length(which(tbl.match.2$start < start.loc)), 4)
   checkEquals(length(which(tbl.match.2$start > start.loc)), 3)

} # test_fimoBatch
#-----------------------------------------------------------------------------------------------------------------------
PriceLab/ChIPseqMotifMatch documentation built on Jan. 23, 2020, 4:48 a.m.