TrenaProjectLymphocyte: Lymphocyte gene expression and variant data

library(RUnit)
library(trenaSGM)
library(TrenaProjectLymphocyte)
library(org.Hs.eg.db)
#------------------------------------------------------------------------------------------------------------------------
Sys.setlocale("LC_ALL", "C")
#------------------------------------------------------------------------------------------------------------------------
runTests <- function()
{
   test_BEND3()

} # runTests
#------------------------------------------------------------------------------------------------------------------------
test_BEND3 <- function()
{
   printf("--- test_BEND3")

   proj.L <- TrenaProjectLymphocyte()
   targetGene <- "BEND3"
   setTargetGene(proj.L, targetGene)
   xt <- getTranscriptsTable(proj.L)

   tbl.regions <- data.frame(chrom=xt$chrom, start=xt$end-1000, end=xt$end+2000, stringsAsFactors=FALSE)
   mtx.L <- getExpressionMatrix(proj.L, "GTEX.wholeBlood.rna-seq-geneSymbols.filtered")

   build.spec <- list(title="fp.2000up.1000down",
                      type="footprint.database",
                      regions=tbl.regions,
                      geneSymbol=targetGene,
                      tss=xt$tss,
                      matrix=mtx.L,
                      db.host="khaleesi.systemsbiology.net",
                      databases="lymphoblast_hint_20",
                      annotationDbFile=dbfile(org.Hs.eg.db),
                      motifDiscovery="builtinFimo",
                      tfPool=allKnownTFs(),
                      tfMapping="MotifDB",
                      tfPrefilterCorrelation=0.2,
                      orderModelByColumn="rfScore",
                      solverNames=c("lasso", "lassopv", "pearson", "randomForest", "ridge", "spearman"))

   fpBuilder <- FootprintDatabaseModelBuilder(getGenome(proj.L), targetGene,  build.spec, quiet=TRUE)
   x <-build(fpBuilder)
   lapply(x, dim)
   which(x$model$pearsonCoeff < 0)  # just #66
   tfs <- allKnownTFs()
   length(tfs)
   tfs.in.mtx <- intersect(tfs, rownames(mtx.L))
   length(tfs.in.mtx)  # 885
   cors <- lapply(tfs.in.mtx, function(tf) cor(mtx.L[tf,], mtx[targetGene,]))
   names(cors) <- tfs.in.mtx
   fivenum(as.numeric(cors)) # [1] -0.2420221  0.1791781  0.3808180  0.5242496  0.7565293
   length(which(cors < 0))   #37

} # test_BEND3
#------------------------------------------------------------------------------------------------------------------------
test_GTExSkin <- function()
{
   printf("--- test_GTExSkin")

   library(TrenaProjectSkin)
   proj.S <- TrenaProjectSkin()

   targetGene <- "BEND3"
   setTargetGene(proj.S, targetGene)
   xt <- getTranscriptsTable(proj.S)

   tbl.regions <- data.frame(chrom=xt$chrom, start=xt$end-1000, end=xt$end+2000, stringsAsFactors=FALSE)
   getExpressionMatrixNames(proj.S)
   mtx.S <- getExpressionMatrix(proj.S, "gtex.fibroblast")

   tfs <- allKnownTFs()
   length(tfs)
   tfs.in.mtx <- intersect(tfs, rownames(mtx.S))
   length(tfs.in.mtx)  # 885
   cors <- lapply(tfs.in.mtx, function(tf) cor(mtx.S[tf,], mtx.S[targetGene,]))
   names(cors) <- tfs.in.mtx
   fivenum(as.numeric(cors)) # [1] -0.2420221  0.1791781  0.3808180  0.5242496  0.7565293
   length(which(cors < 0))   #37
   hist(as.numeric(cors))

   getgenome <- "hg38"
   targetGene <- "hocusPocus"
   chromosome <- "chr6"
   upstream <- 2000
   downstream <- 200
   tss <- 41163186

      # strand-aware start and end: trem2 is on the minus strand
   start <- tss - downstream
   end   <- tss + upstream
   tbl.regions <- data.frame(chrom=chromosome, start=start, end=end, stringsAsFactors=FALSE)

   build.spec <- list(title="fp.2000up.200down",
                      type="footprint.database",
                      regions=tbl.regions,
                      tss=tss,
                      geneSymbol=targetGene,
                      matrix=mtx,
                      db.host="khaleesi.systemsbiology.net",
                      databases=list("brain_hint_20"),
                      annotationDbFile=dbfile(org.Hs.eg.db),
                      motifDiscovery="builtinFimo",
                      tfPool=allKnownTFs(),
                      tfMapping="MotifDB",
                      tfPrefilterCorrelation=0.4,
                      orderModelByColumn="rfScore",
                      solverNames=c("lasso", "lassopv", "pearson", "randomForest", "ridge", "spearman"))

     #------------------------------------------------------------
     # use the above build.spec: a small region, high correlation
     # required, MotifDb for motif/tf lookup
     #------------------------------------------------------------

   checkException(fpBuilder <- FootprintDatabaseModelBuilder(genome, targetGene, build.spec, quiet=TRUE), silent=TRUE)

} # test_targetGeneNotInExpressionMatrix
#------------------------------------------------------------------------------------------------------------------------
test_build.small.fimo.motifDB.mapping.cor04 <- function()
{
   printf("--- test_build.small.fimo.motifDB.mapping.cor04")

   genome <- "hg38"
   targetGene <- "TREM2"
   chromosome <- "chr6"
   upstream <- 2000
   downstream <- 200
   tss <- 41163186

      # strand-aware start and end: trem2 is on the minus strand
   start <- tss - downstream
   end   <- tss + upstream
   tbl.regions <- data.frame(chrom=chromosome, start=start, end=end, stringsAsFactors=FALSE)

   build.spec <- list(title="fp.2000up.200down",
                      type="footprint.database",
                      regions=tbl.regions,
                      tss=tss,
                      geneSymbol=targetGene,
                      matrix=mtx,
                      db.host="khaleesi.systemsbiology.net",
                      databases=list("brain_hint_20"),
                      annotationDbFile=dbfile(org.Hs.eg.db),
                      motifDiscovery="builtinFimo",
                      tfPool=allKnownTFs(),
                      tfMapping="MotifDB",
                      tfPrefilterCorrelation=0.4,
                      orderModelByColumn="rfScore",
                      solverNames=c("lasso", "lassopv", "pearson", "randomForest", "ridge", "spearman"))

     #------------------------------------------------------------
     # use the above build.spec: a small region, high correlation
     # required, MotifDb for motif/tf lookup
     #------------------------------------------------------------

   fpBuilder <- FootprintDatabaseModelBuilder(genome, targetGene, build.spec, quiet=TRUE)
   x <- build(fpBuilder)
   checkEquals(names(x), c("model", "regulatoryRegions"))
   tbl.regions <- x$regulatoryRegions
   tbl.model <- x$model
   tbl.model <- tbl.model[order(tbl.model$rfScore, decreasing=TRUE),]
   #browser()
   #xyz <- "test_FPDB, line 93"
   expected.tfs <- c("IKZF1", "CEBPA", "IRF8", "TAL1", "NR6A1", "IRF2")

   checkEquals(tbl.model$gene, expected.tfs)
   checkTrue(all(expected.tfs %in% tbl.regions$geneSymbol))
   checkTrue(all(tbl.regions$chrom == chromosome))
   checkTrue(all(tbl.regions$fp_start >= start))
   checkTrue(all(tbl.regions$fp_end <= end))

      #---------------------------------------------------------------------------
      # now split up the small 2200 bp promoter region into duplicate
      # chrom/start/stop rows.  the footprints and models return should be
      # identical with what was obtained above
      #---------------------------------------------------------------------------

   tbl.regions <- data.frame(chrom=rep(chromosome, 2),
                             start=rep(start, 2),
                             end=rep(end, 2),
                             stringsAsFactors=FALSE)
   build.spec$region <- tbl.regions
   fpBuilder.2 <- FootprintDatabaseModelBuilder(genome, targetGene, build.spec, quiet=TRUE)
   x.2 <- build(fpBuilder.2)

   checkEquals(names(x.2), c("model", "regulatoryRegions"))
   tbl.regions.2 <- x.2$regulatoryRegions
   tbl.model.2 <- x.2$model
   #tbl.model.2 <- tbl.model[order(tbl.model$rfScore, decreasing=TRUE),]
   expected.tfs <- c("IKZF1", "CEBPA", "IRF8", "TAL1", "NR6A1", "IRF2")

   #browser()
   checkEquals(tbl.model.2$gene, expected.tfs)
   checkTrue(all(expected.tfs %in% tbl.regions.2$geneSymbol))
   checkTrue(all(tbl.regions.2$chrom == chromosome))
   checkTrue(all(tbl.regions.2$fp_start >= start))
   checkTrue(all(tbl.regions.2$fp_end <= end))

   #browser()
   #xyz <- "test_FPDB, line 93"


      #---------------------------------------------------------------------------
      # now split up the small 2200 bp promoter region into two partly overlapping
      # chrom/start/stop rows.  the footprints and models return should be
      # identical with what was obtained above
      #---------------------------------------------------------------------------

   tbl.regions <- data.frame(chrom=rep(chromosome, 2),
                             start=c(41162986, 41163986),
                             end=c(41163886, 41165186),
                             stringsAsFactors=FALSE)
   build.spec$region <- tbl.regions
   fpBuilder.3 <- FootprintDatabaseModelBuilder(genome, targetGene, build.spec, quiet=TRUE)
   x.3 <- build(fpBuilder.3)

   checkEquals(names(x.3), c("model", "regulatoryRegions"))
   tbl.regions.3 <- x.3$regulatoryRegions
   tbl.model.3 <- x.3$model
   expected.tfs <- c("IKZF1", "CEBPA", "IRF8", "TAL1", "NR6A1", "IRF2")

   checkTrue(length(intersect(tbl.model.3$gene, expected.tfs)) > 3)  # allow for some stochasticity
   checkTrue(all(tbl.regions.3$chrom == chromosome))
   checkTrue(all(tbl.regions.3$fp_start >= start))
   checkTrue(all(tbl.regions.3$fp_end <= end))

} # test_build.small.fimo.motifDB.mapping.cor04
#------------------------------------------------------------------------------------------------------------------------
test_build.small.fimo.motifDB.mapping.cor.02 <- function()
{
   printf("--- test_build.small.fimo.motifDB.mapping.cor.02")

   targetGene <- "TREM2"
   chromosome <- "chr6"
   upstream <- 2000
   downstream <- 200
   tss <- 41163186
      # strand-aware start and end: trem2 is on the minus strand
   start <- tss - downstream
   end   <- tss + upstream
   tbl.regions <- data.frame(chrom=chromosome, start=start, end=end, stringsAsFactors=FALSE)

   build.spec <- list(title="fp.2000up.200down",
                      type="footprint.database",
                      regions=tbl.regions,
                      tss=tss,
                      geneSymbol=targetGene,
                      matrix=mtx,
                      db.host="khaleesi.systemsbiology.net",
                      databases=list("brain_hint_20"),
                      annotationDbFile=dbfile(org.Hs.eg.db),
                      motifDiscovery="builtinFimo",
                      tfPool=allKnownTFs(),
                      tfMapping="MotifDB",
                      tfPrefilterCorrelation=0.2,
                      orderModelByColumn="rfScore",
                      solverNames=c("lasso", "lassopv", "pearson", "randomForest", "ridge", "spearman"))

   fpBuilder <- FootprintDatabaseModelBuilder("hg38", "TREM2", build.spec, quiet=TRUE)
   x <- build(fpBuilder)
   tbl.regions <- x$regulatoryRegions
   tbl.model <- x$model
   tbl.model <- tbl.model[order(tbl.model$rfScore, decreasing=TRUE),]
   checkTrue(nrow(tbl.model) > 20)
   top.tfs <- subset(tbl.model, rfScore >= 4)$gene
   checkTrue(all(top.tfs %in% tbl.regions$geneSymbol))

} # test_build.small.fimo.motifDB.mapping.cor02
#------------------------------------------------------------------------------------------------------------------------
test_build.10kb.fimo.motifDB.mapping.cor04 <- function()
{
   printf("--- test_build.10kb.fimo.motifDB.mapping.cor04")

   targetGene <- "TREM2"
   chromosome <- "chr6"
   upstream <- 5000
   downstream <- 5000
   tss <- 41163186
      # strand-aware start and end: trem2 is on the minus strand
   start <- tss - downstream
   end   <- tss + upstream
   tbl.regions <- data.frame(chrom=chromosome, start=start, end=end, stringsAsFactors=FALSE)

   build.spec <- list(title="fp.5kbup.5kbdown",
                      type="footprint.database",
                      geneSymbol=targetGene,
                      regions=tbl.regions,
                      tss=tss,
                      matrix=mtx,
                      db.host="khaleesi.systemsbiology.net",
                      databases=list("brain_hint_20"),
                      annotationDbFile=dbfile(org.Hs.eg.db),
                      motifDiscovery="builtinFimo",
                      tfPool=allKnownTFs(),
                      tfMapping="MotifDB",
                      tfPrefilterCorrelation=0.4,
                      orderModelByColumn="rfScore",
                      solverNames=c("lasso", "lassopv", "pearson", "randomForest", "ridge", "spearman"))

     #------------------------------------------------------------
     # use the above build.spec: a small region, high correlation
     # required, MotifDb for motif/tf lookup
     #------------------------------------------------------------

   fpBuilder <- FootprintDatabaseModelBuilder("hg38", "TREM2", build.spec, quiet=TRUE)
   x <- build(fpBuilder)
   tbl.regions <- x$regulatoryRegions
   tbl.model <- x$model
   tbl.model <- tbl.model[order(tbl.model$rfScore, decreasing=TRUE),]
   checkTrue(nrow(tbl.model) > 10)
   top.tfs <- subset(tbl.model, rfScore >= 4)$gene
   checkTrue(all(top.tfs %in% tbl.regions$geneSymbol))

} # test_build.10kb.fimo.motifDB.mapping.cor04
#------------------------------------------------------------------------------------------------------------------------
test_build.10kb.fimo.tfclass.mapping.cor04 <- function()
{
   printf("--- test_build.10kb.fimo.tfclass.mapping.cor04")

   targetGene <- "TREM2"
   chromosome <- "chr6"
   upstream <- 5000
   downstream <- 5000
   tss <- 41163186
      # strand-aware start and end: trem2 is on the minus strand
   start <- tss - downstream
   end   <- tss + upstream
   tbl.regions <- data.frame(chrom=chromosome, start=start, end=end, stringsAsFactors=FALSE)

   build.spec <- list(title="fp.5kbup.5kbdown",
                      type="footprint.database",
                      geneSymbol=targetGene,
                      regions=tbl.regions,
                      tss=tss,
                      matrix=mtx,
                      db.host="khaleesi.systemsbiology.net",
                      databases=list("brain_hint_20"),
                      annotationDbFile=dbfile(org.Hs.eg.db),
                      motifDiscovery="builtinFimo",
                      tfPool=allKnownTFs(),
                      tfMapping="TFClass",
                      tfPrefilterCorrelation=0.4,
                      orderModelByColumn="rfScore",
                      solverNames=c("lasso", "lassopv", "pearson", "randomForest", "ridge", "spearman"))

     #------------------------------------------------------------
     # use the above build.spec: a small region, high correlation
     # required, MotifDb for motif/tf lookup
     #------------------------------------------------------------

   fpBuilder <- FootprintDatabaseModelBuilder("hg38", "TREM2", build.spec, quiet=TRUE)
   x <- build(fpBuilder)
   tbl.regions <- x$regulatoryRegions
   tbl.model <- x$model
   tbl.model <- tbl.model[order(tbl.model$rfScore, decreasing=TRUE),]
   checkTrue(nrow(tbl.model) > 10)
   top.10.tfs <- tbl.model$gene[1:10]   # already sorted by rfScore
   expected.tfs <- c("SPI1", "TFEC", "CEBPA", "BACH1", "TAL1", "FLI1", "ELK3", "KLF3", "FOXP1", "MESP1")
      # allow for some randomness in the result
   checkTrue(length(intersect(top.10.tfs, expected.tfs)) >= 8)
   checkTrue(all(top.10.tfs %in% tbl.regions$geneSymbol))

} # test_build.10kb.fimo.motifDB.mapping.cor04
#------------------------------------------------------------------------------------------------------------------------
test_reproduceCorysTrem2model <- function()
{
   printf("--- test_reproduceCorysTrem2model")
   tss <- 41163186
      # strand-aware start and end: trem2 is on the minus strand

   build.spec <- list(title="fp.enhancers",
                      type="footprint.database",
                      geneSymbol="TREM2",
                      regions=tbl.enhancers,
                      tss=tss,
                      matrix=mtx,
                      db.host="khaleesi.systemsbiology.net",
                      databases=c("brain_hint_20", "brain_hint_16", "brain_wellington_20", "brain_wellington_16"),
                      motifDiscovery="builtinFimo",
                      annotationDbFile=dbfile(org.Hs.eg.db),
                      tfPool=allKnownTFs(),
                      tfMapping=c("TFClass", "MotifDb"),
                      tfPrefilterCorrelation=0.4,
                      orderModelByColumn="rfScore",
                      solverNames=c("lasso", "lassopv", "pearson", "randomForest", "ridge", "spearman"))

   fpBuilder <- FootprintDatabaseModelBuilder("hg38", "TREM2", build.spec, quiet=FALSE)
   x <- build(fpBuilder)
   tbl.regions <- x$regulatoryRegions
   tbl.model <- x$model
   checkTrue(nrow(tbl.model) > 20)
   top.tfs <- subset(tbl.model, rfScore >= 4)$gene
   checkTrue(all(top.tfs %in% tbl.regions$geneSymbol))
   checkTrue(all(c("IKZF1", "IRF5", "LYL1", "SPI1") %in% top.tfs))

      # now compare to cory's previous model, which has been designated the reference
   top.tfs.bothModels <- intersect(rownames(subset(tbl.trena, pearsonCoeff >= 0.4))[1:10], tbl.model$gene[1:10])
   checkTrue(length(top.tfs.bothModels) >= 8)  # usually exactly 9, with only BHLHE41 missing

      # the missing tf is BHLHE41, which is a poor-ish fit (~80%) to MA0636.1
      # and a better fit for the more relaxed MA0692.1
      # to see this:
      #    x <- query(MotifDb, "jaspar2018", c("MA0636.1", "MA0692.1"))
      #    motifStack(lapply(names(x), function(mName) new("pfm", x[[mName]], name=mName)))
      # we had previously mapped the permissive MA0692.1 to BHLHE41, but
      # this (mysteriously, but appropriately) is no longer the case:
      #
      # motifToGene(MotifDb, "MA0692.1", "MotifDb")
      #      motif geneSymbol pubmedID organism  source
      # 1 MA0692.1       TFEB 24194598 Hsapiens MotifDb
      # motifToGene(MotifDb, "MA0692.1", "TFClass")
      #       motif geneSymbol pubmedID  source organism
      # 1  MA0692.1       TFEB 23180794 TFClass Hsapiens
      # 2  MA0692.1       MXI1 23180794 TFClass Hsapiens
      # 3  MA0692.1        MNT 23180794 TFClass Hsapiens
      # 4  MA0692.1        MLX 23180794 TFClass Hsapiens
      # 5  MA0692.1        MAX 23180794 TFClass Hsapiens
      # 6  MA0692.1       USF1 23180794 TFClass Hsapiens
      # 7  MA0692.1       USF2 23180794 TFClass Hsapiens
      # 8  MA0692.1       TFE3 23180794 TFClass Hsapiens
      # 9  MA0692.1       TFEC 23180794 TFClass Hsapiens
      # 10 MA0692.1       MITF 23180794 TFClass Hsapiens
      # 11 MA0692.1       MYCN 23180794 TFClass Hsapiens
      # and the single consistent mappoing for MA0636.1
      #   motifToGene(MotifDb, "MA0636.1", "MotifDb")
      #       motif geneSymbol pubmedID organism  source
      #  1 MA0636.1    BHLHE41 24194598 Hsapiens MotifDb
      #  motifToGene(MotifDb, "MA0636.1", "TFClass")
      #       motif geneSymbol pubmedID  source organism
      #  1 MA0636.1    BHLHE41 23180794 TFClass Hsapiens

   matched.tfs <- intersect(rownames(tbl.trena)[1:10], tbl.model$gene[1:10])
   missing.tfs <- setdiff(rownames(tbl.trena)[1:10], tbl.model$gene[1:10])
   checkTrue(length(matched.tfs) >= 8)
      #  checkTrue("BHLHE41" %in% missing.tfs)   # usually true, but stochasticity can interfere

      # now take a deeper look.   agreement is still pretty good
   checkEquals(length(intersect(tbl.model$gene, rownames(subset(tbl.trena, pearsonCoeff >= 0.4)))), 19)

} # test_reproduceCorysTrem2model
#------------------------------------------------------------------------------------------------------------------------
viz.corysTrem2Model <- function()
{
   library(igvR)
   library(motifStack)
   igv <- igvR()
   setGenome(igv, "hg38")
   if(!exists("tbl.enhancers"))
      print(load(system.file(package="trenaSGM", "extdata", "enhancers.TREM2.RData")))

   bigRegion <- with(tbl.enhancers, sprintf("%s:%d-%d", unique(chrom), min(start)-2000, max(end)+2000))
   showGenomicRegion(igv, bigRegion)
   track <- DataFrameAnnotationTrack("GeneHancer 4.6", tbl.enhancers, color="black")
   displayTrack(igv, track)

        #---- v4.7 enhancers
    load("../extdata/enhancers.v47.TREM2.RData")
    trackv47 <- DataFrameAnnotationTrack("GeneHancer 4.7", tbl.enhancers, color="purple")
    displayTrack(igv, trackv47)

     #--- the LYL1 mystery: 45 footprints which trenaSGM have not yet found
   tbl.lyl1 <- subset(tbl.footprints, tf == "LYL1")
   track.lyl1 <- DataFrameAnnotationTrack("LYL1", tbl.lyl1[, c("chrom", "fp_start", "fp_end", "motifName")], color="darkred")
   displayTrack(igv, track.lyl1)
   long.motif.names <- unique(tbl.lyl1$motifName)   # just 4
   motifStack(lapply(long.motif.names, function(mName) new("pfm", MotifDb[[mName]], name=mName)))
   short.motif.names <- mcols(MotifDb[long.motif.names])$providerId   # [1] "MA0091.1" "MA0140.2" "MA0092.1" "MA0048.2"


   tbl.ikzf1 <- subset(tbl.footprints, tf=="IKZF1")
   dups <- which(duplicated(tbl.ikzf1$loc))
   if(length(dups) > 0)
      tbl.ikzf1 <- tbl.ikzf1[-dups,]
   track <- DataFrameAnnotationTrack("cory IKZF1", tbl.ikzf1[, c("chrom", "start", "endpos")], color="green")
   displayTrack(igv, track)
   dim(subset(tbl.footprints, tf=="IKZF1"))  # [1] 40 20

} # viz.corysTrem2Model
#------------------------------------------------------------------------------------------------------------------------
queryDB <- function()
{
   library(RPostgreSQL)
   dbConnection <- dbConnect(PostgreSQL(), user="trena", password="trena", host="khaleesi", dbname="brain_hint_20")
   tbl.regions <- data.frame(chrom="chr6", start=41200271, end=41200306, stringsAsFactors=FALSE)
   tbl.hits <- trenaSGM:::.multiQueryFootprints(dbConnection, tbl.regions)

} # queryDB
#------------------------------------------------------------------------------------------------------------------------
find.tf.bindingSites <- function(tf)
{
   mdb.tf <- query(MotifDb, c("hsapiens", "jaspar2018", "BHLHE41"))
   stopifnot(length(mdb.tf) > 0)
   motifs <- mcols(mdb.tf)$providerId

   print(load(system.file(package="trenaSGM", "extdata", "enhancers.TREM2.RData")))  # tbl.enhancers
   bigRegion <- with(tbl.enhancers, sprintf("%s:%d-%d", unique(chrom), min(start)-2000, max(end)+2000))
   mm <- MotifMatcher("hg38", pfms=as.list(mdb.tf))

   tbl.motifs <- findMatchesByChromosomalRegion(mm, tbl.enhancers, pwmMatchMinimumAsPercentage=80)
   track <- DataFrameAnnotationTrack("MA0636.1", tbl.motifs[, c("chrom", "motifStart", "motifEnd")], color="green")
   displayTrack(igv, track)

   tbl.bhlhe41 <- subset(tbl.footprints, tf=="BHLHE41")
   track <- DataFrameAnnotationTrack("fimo-BHLHE41", tbl.bhlhe41[, c("chrom", "fp_start", "fp_end")], color="blue")
   displayTrack(igv, track)


} # find.tf.bindingSites
#------------------------------------------------------------------------------------------------------------------------
test_tfPoolOption <- function()
{
   genome <- "hg38"
   targetGene <- "TREM2"
   chromosome <- "chr6"
   upstream <- 2000
   downstream <- 200
   tss <- 41163186

      # strand-aware start and end: trem2 is on the minus strand
   start <- tss - downstream
   end   <- tss + upstream
   tbl.regions <- data.frame(chrom=chromosome, start=start, end=end, stringsAsFactors=FALSE)

   build.spec <- list(title="fp.2000up.200down",
                      type="footprint.database",
                      geneSymbol=targetGene,
                      regions=tbl.regions,
                      tss=tss,
                      matrix=mtx,
                      db.host="khaleesi.systemsbiology.net",
                      databases=list("brain_hint_20"),
                      annotationDbFile=dbfile(org.Hs.eg.db),
                      motifDiscovery="builtinFimo",
                      tfPool=allKnownTFs(),
                      tfMapping="MotifDB",
                      tfPool=allKnownTFs(),
                      tfPrefilterCorrelation=0.4,
                      orderModelByColumn="rfScore",
                      solverNames=c("lasso", "lassopv", "pearson", "randomForest", "ridge", "spearman"))

     #------------------------------------------------------------
     # use the above build.spec: a small region, high correlation
     # required, MotifDb for motif/tf lookup, all known tfs.
     #------------------------------------------------------------

   builder <- FootprintDatabaseModelBuilder(genome, targetGene, build.spec, quiet=TRUE)
   x <- build(builder)

   build.spec$tfPool <- c("IKZF1", "TAL1")
   builder <- FootprintDatabaseModelBuilder(genome, targetGene, build.spec, quiet=TRUE)
   x <- build(builder)
   checkEquals(sort(x$model$gene), sort(build.spec$tfPool))
   checkEquals(sort(unique(x$regulatoryRegions$geneSymbol)), sort(build.spec$tfPool))

} # test_tfPoolOption
#------------------------------------------------------------------------------------------------------------------------
# the footprint databases (as of jun 20 2018) name footprint/motif-associated TFs by their HGNC gene symbols
# to support trena analysis on expression data expressed with ensg (ensembl gene) IDs, we need to translate
# those footprint tfs from geneSymbol to ensembl ids.  develop and test that here
#
# use this mapping
#    SYMBOL         ENSEMBL
# 1   PRDM2 ENSG00000116731
# 2   EOMES ENSG00000163508
# 3    ELF2 ENSG00000109381
# 4   TREM2 ENSG00000095970
# 5   TBPL1 ENSG00000028839
# 6    IRF5 ENSG00000128604
# 7    SPI1 ENSG00000066336
# 8   PRDM4 ENSG00000110851
# 9    EBF4 ENSG00000088881
# 10   LYL1 ENSG00000104903
# 11  IKZF1 ENSG00000185811
# 12   TFEC ENSG00000105967
# 13  DMRT2 ENSG00000173253
test_modelExpressionDataWithEnsgIDs <- function()
{
   printf("--- test_ensgIDs")

      # create a smallish expression matrix with ENSEMBL gene ids as rownames
      # starting with mtx.tcx for which we have a good gene-symbol-centric model

   tfs.highRanked <- c("IKZF1", "IRF5",  "LYL1",  "SPI1",  "TFEC")
   set.seed(17)
   tfs.randomUnranked <- allKnownTFs()[sample(1:1000, 100)]
   all.genes <- intersect(rownames(mtx), c("TREM2", tfs.highRanked, tfs.randomUnranked))
   suppressMessages(tbl.map <- select(org.Hs.eg.db, keys=all.genes, keytype="SYMBOL", columns="ENSEMBL"))
   dups <- which(duplicated(tbl.map$SYMBOL))
   if(length(dups) > 0)
      tbl.map <- tbl.map[-dups,]

   checkTrue(all(tbl.map$SYMBOL %in% rownames(mtx)))
   mtx.sub <- mtx[tbl.map$SYMBOL,]
   indices <- match(tbl.map$SYMBOL, rownames(mtx.sub))
   rownames(mtx.sub) <- tbl.map$ENSEMBL[indices]

   genome <- "hg38"
   targetGene <- "ENSG00000095970"  # "TREM2"
   chromosome <- "chr6"
   upstream <- 2000
   downstream <- 200
   tss <- 41163186

      # strand-aware start and end: trem2 is on the minus strand
   start <- tss - downstream
   end   <- tss + upstream
   tbl.regions <- data.frame(chrom=chromosome, start=start, end=end, stringsAsFactors=FALSE)

   build.spec <- list(title="fp.ensemblTest",
                      type="footprint.database",
                      geneSymbol=targetGene,
                      regions=tbl.regions,
                      tss=tss,
                      matrix=mtx.sub,
                      db.host="khaleesi.systemsbiology.net",
                      databases=list("brain_hint_20"),
                      annotationDbFile=dbfile(org.Hs.eg.db),
                      motifDiscovery="builtinFimo",
                      tfMapping="MotifDB",
                      tfPool=allKnownTFs(identifierType="ensemblGeneID"),
                      tfPrefilterCorrelation=0.4,
                      orderModelByColumn="rfScore",
                      solverNames=c("lasso", "lassopv", "pearson", "randomForest", "ridge", "spearman"))

     #------------------------------------------------------------
     # use the above build.spec: a small region, high correlation
     # required, MotifDb for motif/tf lookup, all known tfs.
     #------------------------------------------------------------

   builder <- FootprintDatabaseModelBuilder(genome, targetGene, build.spec, quiet=TRUE)
   x <- build(builder)
   checkTrue("ENSG00000185811" %in% x$model$gene)

} # test_ensgIDs
#------------------------------------------------------------------------------------------------------------------------
test_noGenesAboveExpressionCorrelationThreshold <- function()
{
   printf("--- test_noGenesAboveExpressionCorrelationThreshold")

   genome <- "hg38"
   targetGene <- "TREM2"
   chromosome <- "chr6"
   upstream <- 2000
   downstream <- 200
   tss <- 41163186

      # strand-aware start and end: trem2 is on the minus strand
   start <- tss - downstream
   end   <- tss + upstream
   tbl.regions <- data.frame(chrom=chromosome, start=start, end=end, stringsAsFactors=FALSE)

   build.spec <- list(title="fp.2000up.200down",
                      type="footprint.database",
                      geneSymbol=targetGene,
                      regions=tbl.regions,
                      tss=tss,
                      matrix=mtx,
                      db.host="khaleesi.systemsbiology.net",
                      databases=list("brain_hint_20"),
                      annotationDbFile=dbfile(org.Hs.eg.db),
                      motifDiscovery="builtinFimo",
                      tfPool=allKnownTFs(),
                      tfMapping="MotifDB",
                      tfPool=allKnownTFs(),
                      tfPrefilterCorrelation=0.99,
                      orderModelByColumn="rfScore",
                      solverNames=c("lasso", "lassopv", "pearson", "randomForest", "ridge", "spearman"))

     #------------------------------------------------------------
     # use the above build.spec: a small region, high correlation
     # required, MotifDb for motif/tf lookup, all known tfs.
     #------------------------------------------------------------

   builder <- FootprintDatabaseModelBuilder(genome, targetGene, build.spec, quiet=TRUE)
   x <- build(builder)
   checkEquals(names(x), c("model", "regulatoryRegions"))
   checkEquals(nrow(x$model), 0)
   checkEquals(nrow(x$regulatoryRegions), 0)

} # test_noGenesAboveExpressionCorrelationThreshold()
#------------------------------------------------------------------------------------------------------------------------
test_stagedExecution <- function()
{
   printf("--- test_stagedExecution")

   genome <- "hg38"
   targetGene <- "TREM2"
   chromosome <- "chr6"
   upstream <- 2000
   downstream <- 200
   tss <- 41163186

      # strand-aware start and end: trem2 is on the minus strand
   start <- tss - downstream
   end   <- tss + upstream
   tbl.regions <- data.frame(chrom=chromosome, start=start, end=end, stringsAsFactors=FALSE)

   build.spec <- list(title="fp.2000up.200down",
                      type="footprint.database",
                      geneSymbol=targetGene,
                      regions=tbl.regions,
                      tss=tss,
                      matrix=mtx,
                      db.host="khaleesi.systemsbiology.net",
                      databases=list("brain_hint_20"),
                      annotationDbFile=dbfile(org.Hs.eg.db),
                      motifDiscovery="builtinFimo",
                      tfPool=allKnownTFs(),
                      tfMapping="MotifDB",
                      tfPrefilterCorrelation=0.2,
                      orderModelByColumn="rfScore",
                      solverNames=c("lasso", "lassopv", "pearson", "randomForest", "ridge", "spearman"))

     #------------------------------------------------------------
     # use the above build.spec: a small region, high correlation
     # required, MotifDb for motif/tf lookup
     #------------------------------------------------------------

   # stageDir <- "stage"
   stageDir <- tempdir()

   fpBuilder <- FootprintDatabaseModelBuilder(genome, targetGene, build.spec, quiet=FALSE,
                                              stagedExecutionDirectory=stageDir)
   fp.filename <- staged.build(fpBuilder, stage="find.footprints")
   checkTrue(file.exists(fp.filename))

   fp.tfMapped.filename <- staged.build(fpBuilder, stage="associateTFs")
   checkTrue(file.exists(fp.tfMapped.filename))

   models.filename <- staged.build(fpBuilder, stage="build.models")

   load(models.filename)
   checkEquals(names(tbls), c("model", "regulatoryRegions"))
   tbl.regions <- tbls$regulatoryRegions
   tbl.model <- tbls$model
   tbl.model <- tbl.model[order(tbl.model$rfScore, decreasing=TRUE),]
   top.tfs <- head(tbl.model$gene)
       # allow for stochasticity, check just 3 of those 6 top.tfs
   checkTrue(all(c("IKZF1", "CEBPA", "IRF8") %in% top.tfs))
   tbl.regions <- tbls$regulatoryRegions
   tfs.in.regions <- unique(tbl.regions$geneSymbol)
      # we keep only regions with tfs in model.  check that
   checkTrue(all(tfs.in.regions %in% tbl.model$gene))

} # test_stagedExecution
#------------------------------------------------------------------------------------------------------------------------
if(!interactive())
   runTests()
PriceLab/TrenaProjectLymphocyte documentation built on Sept. 2, 2019, 8 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
PriceLab/TrenaProjectLymphocyte
Lymphocyte gene expression and variant data

inst/unitTests/test_findNegativeRegulators.R
In PriceLab/TrenaProjectLymphocyte: Lymphocyte gene expression and variant data

R Package Documentation

Browse R Packages

We want your feedback!

PriceLab/TrenaProjectLymphocyte Lymphocyte gene expression and variant data

inst/unitTests/test_findNegativeRegulators.R In PriceLab/TrenaProjectLymphocyte: Lymphocyte gene expression and variant data

R Package Documentation

Browse R Packages

We want your feedback!

PriceLab/TrenaProjectLymphocyte
Lymphocyte gene expression and variant data

inst/unitTests/test_findNegativeRegulators.R
In PriceLab/TrenaProjectLymphocyte: Lymphocyte gene expression and variant data