misc/erythropoiesis/marjorieDemos/tms.R

library(TrenaMultiScore)
library(TrenaProjectErythropoiesis)
library(RUnit)
library(factoextra)
library(RSQLite)
library(ghdb)
library(httr)
#------------------------------------------------------------------------------------------------------------------------
if(!exists("ghdb"))
    ghdb <- GeneHancerDB()
if(!exists("tbl.atac"))
   tbl.atac <- get(load("~/github/TrenaProjectErythropoiesis/misc/multiScore/brandAtacSeqCuration/tbl.atac.fp.RData"))
#------------------------------------------------------------------------------------------------------------------------
if(!exists("tmse")) {
   message(sprintf("--- creating instance of TrenaMultiScore"))
   tpe <- TrenaProjectErythropoiesis()
   tmse <- TrenaMultiScore(tpe, "TAL1");
   }

# from marjorie (email 29 aug 2020) use HSC/MPP for Day0 and CMP for Day2
if(!exists("tbl.atac.corces")){
    tbl.corces.full <- get(load("~/github/TrenaProjectErythropoiesis/prep/import/buenrostro/GSE74912-atacSeq.RData"))
    hsc.cols <- grep("\\.HSC", colnames(tbl.corces.full), ignore.case=TRUE)
    mpp.cols <- grep("MPP", colnames(tbl.corces.full), ignore.case=TRUE)
    cmp.cols <- grep("CMP", colnames(tbl.corces.full), ignore.case=TRUE)
    tbl.corces <- tbl.corces.full[, c(1:3, hsc.cols, mpp.cols, cmp.cols)]
    dim(tbl.corces)
    colnames(tbl.corces) <- c("chrom", "start", "end", "day0.1", "day0.2", "day0.3", "day0.4", "day2.1", "day2.2")
    tbl.atac.corces <- tbl.corces
    tbl.atac.corces.day0.quiet <- subset(tbl.atac.corces, day0.1 == 0 & day0.2 == 0 & day0.3==0 & day0.4 == 0 & day2.1 > 0)
    tbl.atac.corces.day0.quiet <- subset(tbl.atac.corces, day0.1 == 0 & day0.2 == 0 & (day2.1 > 0 | day2.2 > 0))
    tbl.atac.corces.day0.quiet <- subset(tbl.atac.corces, (day0.1 + day0.2 + day0.3 + day0.4) < 3 & (day2.1 + day2.2 > 6))
    }

#------------------------------------------------------------------------------------------------------------------------
# genes with zero-to-high expression between day 0 and day 2, and open chromatin at day 2
findCandidates <- function()
{
   expected <- c("brandLabDifferentiationTimeCourse-16173x28", "brandLabDifferentiationTimeCourse-27171x28")
   checkTrue(all(expected %in% getExpressionMatrixNames(tpe)))
   mtx <- getExpressionMatrix(tpe, expected[1])

   cutoff.1 <- 0.5
   cutoff.2 <- 1

   goi.0 <- names(which(sapply(rownames(mtx), function(geneName) all(mtx[geneName, 1:2] < cutoff.1))))
   print(length(goi.0))
   goi.2 <- names(which(sapply(rownames(mtx), function(geneName) all(mtx[geneName, 3:4] > cutoff.2))))
   print(length(goi.2))

   length(intersect(goi.0, goi.2))
   goi <-intersect(goi.0, goi.2)
   length(goi)

   tissues <- "Common myeloid progenitor CD34+"
   tissues <- "all"
   tbls.gh <- lapply(goi, function(gene) retrieveEnhancersFromDatabase(ghdb, gene, tissues=tissues))

   tbls.ghp <- lapply(goi, function(gene){
       tbl.gh <- retrieveEnhancersFromDatabase(ghdb, gene, tissues=tissues)
       if(nrow(tbl.gh) == 0)
         return(data.frame())
       subset(tbl.gh, combinedscore > 500)
       })

   length(tbls.ghp)
   names(tbls.ghp) <- goi
   tbls.ghp <- tbls.ghp[names(which(lapply(tbls.ghp, nrow) > 0))]
   length(tbls.ghp)

   # displayTrack(igv, DataFrameQuantitativeTrack("GH.CCL1", tbls.gh[["CCL1"]][, c("chrom", "start", "end", "combinedscore")], color="random", autoscale=FALSE, min=0,max=50))

   ghAndATAC <- function(tbl.gh){
       tbl.ov <- as.data.frame(findOverlaps(GRanges(tbl.gh[1, c("chrom", "start", "end")]),
                                            GRanges(tbl.atac.corces.day0.quiet[,  c("chrom", "start", "end")]),
                                            type="any"))
       if(nrow(tbl.ov) > 0)
           return(tbl.atac.corces.day0.quiet[tbl.ov$subjectHits,])
       return(data.frame())
       } # ghAndATAC

   tbls.ghp.atacPattern <- lapply(tbls.ghp, ghAndATAC)
   tbls.ghp.atacPattern <- tbls.ghp.atacPattern[names(which(lapply(tbls.ghp.atacPattern, nrow) > 0))]
   length(tbls.ghp.atacPattern)

   tbl.ccl1 <- ghAndATAC(tbls.gh[["CCL1"]])
   displayTrack(igv, DataFrameQuantitativeTrack("CCL1-atac-HSC.r1", tbl.ccl1[, c(1:3, 4)], color="random", autoscale=FALSE, min=0, max=20))
   displayTrack(igv, DataFrameQuantitativeTrack("CCL1-atac-HSC.r2", tbl.ccl1[, c(1:3, 5)], color="random", autoscale=FALSE, min=0, max=20))
   displayTrack(igv, DataFrameQuantitativeTrack("CCL1-atac-MPP.r1", tbl.ccl1[, c(1:3, 6)], color="random", autoscale=FALSE, min=0, max=20))
   displayTrack(igv, DataFrameQuantitativeTrack("CCL1-atac-MPP.r2", tbl.ccl1[, c(1:3, 7)], color="random", autoscale=FALSE, min=0, max=20))
   displayTrack(igv, DataFrameQuantitativeTrack("CCL1-atac-CMP.r1", tbl.ccl1[, c(1:3, 8)], color="random", autoscale=FALSE, min=0, max=20))
   displayTrack(igv, DataFrameQuantitativeTrack("CCL1-atac-CMP.r2", tbl.ccl1[, c(1:3, 9)], color="random", autoscale=FALSE, min=0, max=20))

   genes.filtered <- names (tbls.ghp.atacPattern)
   uri <- sprintf("http://localhost:8000/goEnrich")
   body.jsonString <- sprintf('%s', toJSON(list(geneSymbols=genes.filtered)))

   r <- POST(uri, body=body.jsonString)

      #sprintf('{"geneSymbols": "%s"}', goi.string))
   tbl <- fromJSON(content(r)[[1]])
   dim(tbl)
   wdth(1000)
   head(tbl, n=10)


} # findCandidates
#------------------------------------------------------------------------------------------------------------------------
build.model <- function(targetGene, fimoThresholdAsNegativeExponent=5, tbl.openChromatin=data.frame())
{
   printf("=========== building model for %s, fimoThreshold: %f", targetGene,
          fimoThresholdAsNegativeExponent)

   results.subDirectory <- sprintf("fimo%d", fimoThresholdAsNegativeExponent)
   filename <- sprintf("%s.RData", targetGene)
   if(!file.exists(results.subDirectory))
      dir.create(results.subDirectory)

   tms.tg <- TrenaMultiScore(tpe, targetGene);
   printf("--- getGeneHancerRegion")
   gh.span <- as.list(getGeneHancerRegion(tms.tg))

   printf("--- getOpenChromatin")
   if(nrow(tbl.openChromatin) == 0)
      findOpenChromatin(tms.tg)
   else
      tms.tg@state$openChromatin <- tbl.openChromatin
   if(nrow(getOpenChromatin(tms.tg)) == 0){
      message(sprintf("no open chromatin for %s, bailing out, saving empty model", targetGene))
      tbl <- data.frame()
      save(tbl, file=file.path(results.subDirectory, filename))
      return(data.frame())
      }
   fimoThreshold <- 10^(-fimoThresholdAsNegativeExponent)
   printf("--- findFimoTFBS")

   jaspar2018.human <- query(MotifDb, c("sapiens", "jaspar2018"))
   hocomocov11.core <- query(MotifDb, "hocomocov11-core")
   motifs <- c(jaspar2018.human,hocomocov11.core)
   printf("--- using %d motifs only for fimo", length(motifs))
   findFimoTFBS(tms.tg, motifs=motifs, fimo.threshold=fimoThreshold)
   printf("--- scoreMotifHitsForConservation")
   scoreMotifHitsForConservation(tms.tg)
   printf("--- scoreMotifHitsForGeneHancer")
   scoreMotifHitsForGeneHancer(tms.tg)
   addDistanceToTSS(tms.tg)

   mtx <- getExpressionMatrix(tpe, "brandLabDifferentiationTimeCourse-27171x28")
   printf("--- addGeneExpressionCorrelations")
   addGeneExpressionCorrelations(tms.tg, mtx)
   printf("--- addGenicAnnotations")
   addGenicAnnotations(tms.tg)
   printf("--- addChip")
   addChIP(tms.tg)

   tbl <- getMultiScoreTable(tms.tg)
   tbl$cor[which(is.na(tbl$cor))] <- 0
   tbl$motifScore <- round(-log10(tbl$p.value), 2)
   tbl$targetGene <- targetGene

   printf("--- model for %s has %d rows", targetGene, nrow(tbl))
   if(nrow(tbl) > 0){
      filePath <- file.path(results.subDirectory, filename)
      save(tbl, file=filePath)
      message(sprintf("saving %d rows model for %s to %s", nrow(tbl), targetGene, filePath))
      }

   invisible(tbl)

} # build.model
#------------------------------------------------------------------------------------------------------------------------
# goi <- function()
# {
#    tbl <- read.table("G2vsG3GenesUP.txt", sep="\t", as.is=TRUE, header=TRUE, nrow=100)
#    dim(tbl)
#    genes <- tbl$GeneName
#    additional.genes <- read.table(file="additionalGenes.txt", stringsAsFactors=FALSE)$V1
#    length(genes)
#    length(additional.genes)
#    all.goi <- sort(unique(c(genes, additional.genes)))
#    length(all.goi)
#    library(org.Hs.eg.db)
#    tbl.ref <- select(org.Hs.eg.db, key=all.goi, columns=c("SYMBOL", "ENTREZID"), keytype="SYMBOL")
#    successes <- tbl.ref[which(!is.na(tbl.ref$ENTREZID)),]$SYMBOL
#    failures <- tbl.ref[which(is.na(tbl.ref$ENTREZID)),]$SYMBOL
#    tbl.ref2 <- select(org.Hs.eg.db, key=failures, columns=c("SYMBOL", "ALIAS", "ENTREZID"), keytype="ALIAS")
#    successes.round2 <- tbl.ref2[which(!is.na(tbl.ref2$SYMBOL)), "SYMBOL"]
#    goi.final <- sort(unique(c(successes, successes.round2)))
#    length(goi.final)  # 112
#
#    return(goi.final)
#
#} # goi
#------------------------------------------------------------------------------------------------------------------------
buildAll <- function(goi, fimoThresholdAsNegativeExponent)
{
  #if(!exists("haney.erythropoiesis.tfs"))
  #   source("~/github/regulatoryGenomePaper/demos/common.R")

  #tfs.oi <- c("GATA1", "GATA2", "FLI1", "SPI1")
  #tfs.oi <- goi()[24:112]
  #tfs.oi <- goi()
  printf("running tms on %d genes", length(goi))

  f <- function(targetGene){
      tryCatch({
         fimo.threshold <- fimoThresholdAsNegativeExponent
         results.file.path <- file.path(sprintf("./fimo%d", fimo.threshold),
                                        sprintf("%s.RData", targetGene))
         if(file.exists(results.file.path))
             printf("results file exists, skipping: %s", results.file.path)
         else
             build.model(targetGene, fimo.threshold)
         },
      error = function(e){
         print("tms error")
         system(sprintf("touch %s", results.file.path))
         print(targetGene)
         print(e)
         })
      } # f

  tbls.all <- lapply(goi, f)
  names(tbls.all) <- goi
  invisible(tbls.all)

} # buildAll
#------------------------------------------------------------------------------------------------------------------------
collectResults <- function(directory, outfile)
{
   full.path.to.directory <- sprintf("~/github/TrenaMultiScore/misc/erythropoiesis/marjorieDemos/%s", directory)
   rdata.files <- list.files(path=full.path.to.directory, pattern=".RData")
   printf("RData file count: %d", length(rdata.files))

   tbls <- lapply(head(rdata.files, n=-1), function(file){
       full.path <- file.path(directory, file)
       if(file.size(full.path) > 1000){
           tbl <- get(load(full.path))
           printf("%30s: %d rows, %d cols", file, nrow(tbl), ncol(tbl))
           return(tbl)
       }})

   tbl.all <- do.call(rbind, tbls)
   printf("about to save combined table (%d rows)  to %s", nrow(tbl.all), outfile)
   if(!is.null(outfile))
      save(tbl.all, file=outfile)
   invisible(tbl.all)

} # collectResults
#------------------------------------------------------------------------------------------------------------------------
to.sqlite <- function(tbl, sqlite.filename)
{
   db <- dbConnect(SQLite(), sqlite.filename, synchronous=NULL)


   system.time(dbWriteTable(db, name="tms", value=tbl, overwrite=TRUE))  # less than 30 seconds
   dbDisconnect(db)

} # to.sqlite
#------------------------------------------------------------------------------------------------------------------------
PriceLab/TrenaMultiScore documentation built on Aug. 22, 2022, 8:01 a.m.