R/TrenaMultiScore.R

Defines functions .queryBocaATACseq .queryHintFootprintRegionsFromDatabase .queryBrandLabATACseq TrenaMultiScore

Documented in TrenaMultiScore

# 1) find tss, chrom, strand, min and max of genehancer regions (aka, "geneRegion")
# 2) load open chromatin in geneRegion: atac-seq or database footprint regions
# 3) find all TFBS in atac-seq regions
# 4) score all TFBS for conservation
# 5) determine distance from TSS for all TFBS
#------------------------------------------------------------------------------------------------------------------------
#' @import methods
#' @importFrom AnnotationDbi select
#' @import org.Hs.eg.db
#' @import RPostgreSQL
#' @import ghdb
#'
#' @import GenomicRanges
#' @import GenomicScores
#' @import phastCons7way.UCSC.hg38
#' @import phastCons100way.UCSC.hg38
#' @import MotifDb
#' @import motifmatchr
#' @importFrom universalmotif convert_motifs
#' @importFrom TFBSTools PFMatrixList
#' @import annotatr
#'
#' @title TrenaMultiScore-class
#'
#' @name TrenaMultiScore-class
#' @rdname TrenaMultiScore-class
#' @aliases TrenaMultiScore
#' @exportClass TrenaMultiScore
#'

.TrenaMultiScore <- setClass("TrenaMultiScore",
                             representation=representation(
                                trenaProject="TrenaProject",
                                targetGene="character",
                                motifDb="MotifList",
                                state="environment"
                                )
                             )
#------------------------------------------------------------------------------------------------------------------------
setGeneric('getProject', signature='obj', function(obj) standardGeneric('getProject'))
setGeneric('getGeneHancerRegion', signature='obj', function(obj) standardGeneric('getGeneHancerRegion'))
setGeneric('findOpenChromatin', signature='obj', function(obj, chrom=NA, start=NA, end=NA,
                                                          intersect.with.geneHancer=FALSE,
                                                          use.merged.atac=FALSE)

              standardGeneric('findOpenChromatin'))
#setGeneric('explicitlySetOpenChromatin', signature='obj', function(obj, tbl) standardGeneric('explicitlySetOpenChromatin'))
setGeneric('getOpenChromatin', signature='obj', function(obj) standardGeneric('getOpenChromatin'))
setGeneric('findFimoTFBS', signature='obj', function(obj, motifs=NA, fimo.threshold=NA, chrom=NA, start=NA, end=NA, genome="hg38")
              standardGeneric('findFimoTFBS'))
setGeneric('findMoodsTFBS', signature='obj', function(obj, motifs=NA, moods.threshold=NA, chrom=NA, start=NA, end=NA)
              standardGeneric('findMoodsTFBS'))
setGeneric('scoreMotifHitsForConservation', signature='obj', function(obj) standardGeneric('scoreMotifHitsForConservation'))
setGeneric('getMultiScoreTable', signature='obj', function(obj) standardGeneric('getMultiScoreTable'))
setGeneric('getTargetGeneInfo', signature='obj', function(obj) standardGeneric('getTargetGeneInfo'))
setGeneric('addDistanceToTSS', signature='obj', function(obj) standardGeneric('addDistanceToTSS'))
setGeneric('scoreMotifHitsForGeneHancer', signature='obj', function(obj) standardGeneric('scoreMotifHitsForGeneHancer'))
setGeneric('addGeneExpressionCorrelations', signature='obj',
           function(obj, mtx, featureName="cor", colnames=list(),
                    method="pearson") standardGeneric('addGeneExpressionCorrelations'))
setGeneric('addGenicAnnotations', signature='obj', function(obj) standardGeneric('addGenicAnnotations'))
setGeneric('addChIP', signature='obj', function(obj) standardGeneric('addChIP'))
setGeneric('addRnaBindingProteins', signature='obj', function(obj) standardGeneric('addRnaBindingProteins'))
setGeneric('getRnaBindingProteins', signature='obj', function(obj) standardGeneric('getRnaBindingProteins'))
setGeneric('scoreMotifHitsForOpenChromatin', signature='obj', function(obj) standardGeneric('scoreMotifHitsForOpenChromatin'))

setGeneric('add.eQTLs', signature='obj', function(obj, tbl.eqtls, pval, abs.beta, eqtl.title)
    standardGeneric('add.eQTLs'))
#------------------------------------------------------------------------------------------------------------------------
#' Define an object of class TrenaMultiScore
#'
#' @description
#' Expression, variant and covariate data for the genes of interest (perhaps unbounded) for pre-term birth studies
#'
#' @rdname TrenaMultiScore-class
#' @param trenaProject a concrete subclass of TrenaProject
#' @param targetGene a character string
#' @param tbl.fimo a data.frame, empty or with fimo matches calculated separate
#' @param quiet logical
#'
#' @export
#'
#' @return An object of the TrenaMultiScore class
#'

TrenaMultiScore <- function(trenaProject, targetGene, tbl.fimo=data.frame(), tbl.oc=data.frame(), quiet=TRUE)
{
   setTargetGene(trenaProject, targetGene)
   state <- new.env(parent=emptyenv())
   state$quiet <- quiet
   state$openChromatin <- tbl.oc
   state$genehancer <- data.frame()
   state$fimo <- tbl.fimo

   .TrenaMultiScore(trenaProject=trenaProject, targetGene=targetGene, motifDb=MotifDb, state=state)

} # TrenaMultiScore, the constructor
#------------------------------------------------------------------------------------------------------------------------
#' return the TMS project
#'
#' @description
#' get a reference to the TrenaProject concrete subclass included in this object
#'
#' @rdname getProject
#' @return a TrenaProject instance
#'
#' @export
#'
setMethod('getProject', 'TrenaMultiScore',

     function(obj){
        obj@trenaProject
        }) # getProject

#------------------------------------------------------------------------------------------------------------------------
#' return the full extent of GeneHancer regions for the targetGene
#'
#' @description
#' GeneHancer often reports distal enhancers within ~1MB of the gene; return that region
#'
#' @rdname getGeneHancerRegion
#' @return a data.frame
#'
#' @export
#'
setMethod('getGeneHancerRegion', 'TrenaMultiScore',

     function(obj){
        ghdb <- GeneHancerDB()
        targetGene <- obj@targetGene
        if(targetGene == "C2orf40")  # todo: painful hack due to inconsistent names, soon fixed by GTEx and ROSMAP I hope
            targetGene <- "ECRG4"
        tbl.gh <- retrieveEnhancersFromDatabase(ghdb, targetGene, tissues="all")
        if(nrow(tbl.gh) == 0){
            return(data.frame())
            } # failed
        obj@state$genehancer <- tbl.gh
        start <- min(tbl.gh$start) - 1000
        end <- max(tbl.gh$end) + 1000
        width <- 1 + end - start
        data.frame(chrom=tbl.gh$chrom[1],
                   start=start, end=end, width=width,
                   stringsAsFactors=FALSE)

        }) # getGeneHancerRegion

#------------------------------------------------------------------------------------------------------------------------
#' find open chromatin in the current gene region, or (if supplied) somewhere else
#'
#' @description
#' Open chromatin from DNase footprints or scATAC-seq is retrieved, saved for later use
#'
#' @rdname findOpenChromatin
#' @param obj a TrenaMultiScore object
#' @param chrom typically NA
#' @param chrom start, integer, chromosomal location, default NA
#' @param chrom end, integer, chromosomal location, default NA
#'
#' @export
#'
setMethod('findOpenChromatin', 'TrenaMultiScore',

     function(obj, chrom=NA, start=NA, end=NA, intersect.with.geneHancer= FALSE, use.merged.atac=FALSE){
         if(is.na(chrom)){
            tbl.x <- getGeneHancerRegion(obj)
            chrom <- tbl.x$chrom
            start <- tbl.x$start
            end   <- tbl.x$end
            }
        if("TrenaProjectErythropoiesis" %in% is(getProject(obj))){
           obj@state$openChromatin <- .queryBrandLabATACseq(chrom, start, end, use.merged.atac)
           if(!obj@state$quiet)
               message(sprintf("regions of Brand ATACseq open chromatin: %d", nrow(obj@state$openChromatin)))
           }
        else if("TrenaProjectAD" %in% is(getProject(obj))){
           #obj@state$openChromatin <- .queryBocaATACseq(chrom, start, end)
           obj@state$openChromatin <- .queryHintFootprintRegionsFromDatabase("brain_hint_16", chrom, start, end)
           if(!obj@state$quiet)
              message(sprintf("regions of open chromatin: %d", nrow(obj@state$openChromatin)))
           }
        else stop(sprintf("no support for open chromatin retrieval in %s", getProject(obj)@projectName))
        if(intersect.with.geneHancer){
            tbl.oc <- obj@state$openChromatin
            tbl.gh <- getGeneRegulatoryRegions(getProject(obj))
            tbl.ov <- as.data.frame(findOverlaps(GRanges(tbl.oc), GRanges(tbl.gh)))
            if(nrow(tbl.ov) == 0){
                tbl.filtered <- data.frame()
            } else {
               tbl.filtered <- tbl.oc[tbl.ov[,1],]
               }
            message(sprintf("intersecting oc with gh: from %d to %d regions", nrow(tbl.oc), nrow(tbl.filtered)))
            obj@state$openChromatin <- tbl.filtered
            }
        nrow(obj@state$openChromatin)
        })


#------------------------------------------------------------------------------------------------------------------------
#' retrieve currently assigned open chromatin
#'
#' @description
#' Return a data.frame with the open chromatin previously retrieved
#'
#' @rdname getOpenChromatin
#' @param obj a TrenaMultiScore object
#' @return data.frame
#'
#' @export
#'
setMethod('getOpenChromatin', 'TrenaMultiScore',

     function(obj){
         obj@state$openChromatin
         })


#------------------------------------------------------------------------------------------------------------------------
#' get scored FIMO motif binding sites
#'
#' @description
#' call FimoBatch to get data.frame of all matches above threshold
#'
#' @rdname getFimoTFBS
#'
#' @param motifs a list of MotifDb items, JASPAR2019 hsapiens by default
#' @param fimo.threshold 1e-4 by default
#' @param chrom geneHancerRegion chrom by default
#' @param start geneHancerRegion start by default
#' @param end geneHancerRegion end by default
#' @param genome character string, "hg38" by default
#'
#' @return a data.frame
#'
#' @export
#'
setMethod('findFimoTFBS', 'TrenaMultiScore',

    function(obj, motifs=list(), fimo.threshold=NA, chrom=NA, start=NA, end=NA, genome="hg38"){

           # fimo table may have been build separately, do not repeat.
       if(nrow(obj@state$fimo) > 1) return

       tbl.fp <- getOpenChromatin(obj)
       if(!obj@state$quiet)
            message(sprintf("--- findFimoTFBS in %d regions of open chromatin", nrow(tbl.fp)))

       if(nrow(tbl.fp) == 0)
          stop("TrenaMultiScore::getFimoTFBS error: no open chromatin regions previously identified.")

       if(length(motifs) == 0)
          motifs <- query(obj@motifDb, c("sapiens"), c("hocomocov11a-core", "jaspar2018"))

       if(is.na(fimo.threshold))
          fimo.threshold <- 1e-4
       tbl.gh <- getGeneHancerRegion(obj)
       if(is.na(chrom))
          chrom <- tbl.gh$chrom
       if(is.na(start))
          start <- tbl.gh$start
       if(is.na(end))
          end <- tbl.gh$end

       meme.file <- "tmp.meme"
       MotifDb::export(motifs, con=meme.file, format="meme")
       source("~/github/fimoService/batchMode/fimoBatchTools.R")
       if(!obj@state$quiet)
          message(sprintf("--- calling fimoBatch on %d regions, %e threshold", nrow(tbl.fp), fimo.threshold))
       tbl.fimo <- fimoBatch(tbl.fp, matchThreshold=fimo.threshold, genomeName=genome, pwmFile=meme.file)
       obj@state$fimo <- tbl.fimo
       if(!obj@state$quiet)
         message(sprintf("--- tbl.fimo stored with %d rows, %d columns", nrow(tbl.fimo), ncol(tbl.fimo)))
       }) # findFimoTFBS

#------------------------------------------------------------------------------------------------------------------------
#' get scored moods motif binding sites
#'
#' @description
#' call moods (wrapped by motifmatchr package) to get data.frame of all matches above threshold
#'
#' @rdname getMoodsTFBS
#'
#' @param motifs a list of MotifDb items, JASPAR2019 hsapiens by default
#' @param moods.threshold 1e-4 by default
#' @param chrom geneHancerRegion chrom by default
#' @param start geneHancerRegion start by default
#' @param end geneHancerRegion end by default
#'
#' @return a data.frame
#'
#' @export
#'
setMethod('findMoodsTFBS', 'TrenaMultiScore',

    function(obj, motifs=NA, moods.threshold=NA, chrom=NA, start=NA, end=NA){

       tbl.oc <- getOpenChromatin(obj)
       if(nrow(tbl.oc) == 0)
          stop("TrenaMultiScore::getMoodsTFBS error: no open chromatin regions previously identified.")

       if(is.na(motifs))
           motifs <- query(obj@motifDb, c("sapiens"), c("hocomoco", "jaspar2018"))

       if(is.na(moods.threshold))
          moods.threshold <- 1e-4

       gr.oc <- GRanges(tbl.oc)
       motifs.pfmatrix <- lapply(motifs, function(motif) convert_motifs(motif, "TFBSTools-PFMatrix"))
       motifs.pfmList <- do.call(PFMatrixList, motifs.pfmatrix)
       gr.match <- matchMotifs(motifs.pfmList, gr.oc, genome="hg38", out="positions", p.cutoff=moods.threshold)
       tbl.moods <- as.data.frame(gr.match)
       if(ncol(tbl.moods) == 8){  # the expected number
          if(nrow(tbl.moods) > 0){
             tfs <- mcols(MotifDb[tbl.moods$group_name])$geneSymbol
             tbl.moods$tf <- tfs
             }
          colnames(tbl.moods)[grep("seqnames", colnames(tbl.moods))] <- "chrom"
          colnames(tbl.moods)[grep("group_name", colnames(tbl.moods))] <- "motifName"
          colnames(tbl.moods)[grep("score", colnames(tbl.moods))] <- "moodsScore"
          tbl.moods$chrom <- as.character(tbl.moods$chrom)
          tbl.moods$strand <- as.character(tbl.moods$strand)
          }
       obj@state$moods <- tbl.moods
       sprintf("tbl.moods stored with %d rows, %d columns", nrow(tbl.moods), ncol(tbl.moods))
       }) # findMoodsTFBS

#------------------------------------------------------------------------------------------------------------------------
.queryBrandLabATACseq <- function(chrom.loc, start.loc, end.loc, use.merged.atac)
{
  tbl.atac <- get(load("~/github/TrenaProjectErythropoiesis/misc/multiScore/brandAtacSeqCuration/tbl.atac.fp.RData"))
  if(use.merged.atac){
      tbl.atac <- get(load("~/github/TrenaProjectErythropoiesis/inst/extdata/genomicRegions/tbl.atacMerged.RData"))
      }
  subset(tbl.atac, chrom==chrom.loc & start >= start.loc & end <= end.loc)

} # .queryBrandLabATACseq
#------------------------------------------------------------------------------------------------------------------------
.queryHintFootprintRegionsFromDatabase <- function(database.name, chrom.loc, start.loc, end.loc)
{
  if(grepl("hagfish", Sys.info()[["nodename"]])){
      suppressWarnings(
          db.access.test <- try(system("/sbin/ping -c 1 khaleesi", intern=TRUE, ignore.stderr=TRUE)))
      if(length(db.access.test) == 0)
          stop("khaleesi database server unavailable")
  }

   db <- dbConnect(PostgreSQL(), user= "trena", password="trena", dbname="brain_hint_16", host="khaleesi")
   query <- sprintf("select * from regions where chrom='%s' and start >= %d and endpos <= %d",
                    chrom.loc, start.loc, end.loc)
   tbl <- dbGetQuery(db, query)
   dbDisconnect(db)

   if(nrow(tbl) > 0){
      tbl <- tbl[, c("chrom", "start", "endpos")]
      colnames(tbl) <- c("chrom", "start", "end")
      }

   tbl

} # .queryHintFootprintRegionsFromDatabase
#------------------------------------------------------------------------------------------------------------------------
.queryBocaATACseq <- function(chrom.loc, start.loc, end.loc)
{
  f <- "~/github/TrenaProjectAD/misc/multiScore/boca/tbl.boca.RData"
  tbl.boca <- get(load(f))

  tbl.tmp <- subset(tbl.boca, chrom==chrom.loc & start >= start.loc & end <= end.loc)
  tbl.reduced <- as.data.frame(reduce(GRanges(tbl.tmp)), row.names=NULL)
  colnames(tbl.reduced)[1] <- "chrom"
  tbl.reduced$chrom <- as.character(tbl.reduced$chrom)

  tbl.reduced

} # .queryBocaATACseq
#------------------------------------------------------------------------------------------------------------------------
#' add open chromatin intersection info (none, partial, complete for the currently held fimo table
#'
#' @description
#'   adds "none", "partial", "complete" annotation to the "oc" column for each motif hit in the
#'   already built (or supplied) fimo table, using the current obj@state$openChromatin table
#'
#'
#' @rdname scoreMotifHitsForConservation
#'
#' @param obj a TrenaMultiScore object
#' @return a data.frame
#'
#' @export
#'
setMethod('scoreMotifHitsForOpenChromatin', 'TrenaMultiScore',

    function(obj){

      tbl.fimo <- obj@state$fimo
      tbl.oc <- obj@state$openChromatin
      if(nrow(tbl.oc) == 0){
         tbl.fimo$oc <- FALSE
         }
      if(nrow(tbl.oc) > 0){
         gr.fimo <- GRanges(seqnames=tbl.fimo$chrom, IRanges(tbl.fimo$start, tbl.fimo$end))
         gr.oc   <- GRanges(seqnames=tbl.oc$chrom, IRanges(tbl.oc$start, tbl.oc$end))
         tbl.ov  <- as.data.frame(findOverlaps(gr.oc, gr.fimo, type="any"))
         openChromatin <- rep(FALSE, nrow(tbl.fimo))
         if(nrow(tbl.ov) > 0){
            hits <- unique(tbl.ov$subjectHits)
            openChromatin[hits] <- TRUE
            }
         tbl.fimo$oc <- openChromatin
         }
      rownames(tbl.fimo) <- NULL
      obj@state$fimo <- tbl.fimo
      }) # scoreMotifHitsForOpenChromatin


#------------------------------------------------------------------------------------------------------------------------
#' add conservation scores to the currently held fimo table
#'
#' @description
#'   adds several conservation scores, each an average, to each motif hit in the already build fimo table
#'
#' @rdname scoreMotifHitsForConservation
#'
#' @param obj a TrenaMultiScore object
#' @return a data.frame
#'
#' @export
#'
setMethod('scoreMotifHitsForConservation', 'TrenaMultiScore',

    function(obj){

      tbl.fimo <- obj@state$fimo
      if(nrow(tbl.fimo) == 0)
         stop("TrenaMultiScore::scoreMotifHitsForConservation error: no fimo hits yet identified.")

        # phastCons100way.UCSC.hg38
        # phastCons30way.UCSC.hg38

       # browser()
       tbl.cons <- as.data.frame(gscores(phastCons7way.UCSC.hg38,
                                         GRanges(tbl.fimo[, c("chrom", "start", "end")])), stringsAsFactors=FALSE)
       tbl.fimo$phast7 <- round(tbl.cons$default, digits=2)

       #if(grepl("khaleesi", Sys.info()[["nodename"]])){
       #   print(load("~/github/TrenaMultiScore/inst/extdata/phastCons30way.UCSC.hg38.downloaded.RData"))
       #   tbl.cons <- as.data.frame(gscores(phastCons30way.UCSC.hg38.downloaded,
       #                                     GRanges(tbl.fimo[, c("chrom", "start", "end")])), stringsAsFactors=FALSE)
       #   }
       #else{
       #   tbl.cons <- as.data.frame(gscores(phastCons30way.UCSC.hg38,
       #                                     GRanges(tbl.fimo[, c("chrom", "start", "end")])), stringsAsFactors=FALSE)
       #   }

       #tbl.fimo$phast30 <- round(tbl.cons$default, digits=2)

       tbl.cons <- as.data.frame(gscores(phastCons100way.UCSC.hg38,
                                         GRanges(tbl.fimo[, c("chrom", "start", "end")])), stringsAsFactors=FALSE)
       tbl.fimo$phast100 <- round(tbl.cons$default, digits=2)

       rownames(tbl.fimo) <- NULL
       obj@state$fimo <- tbl.fimo
       }) # scoreMotifHitsForConservation

#------------------------------------------------------------------------------------------------------------------------
#' returns the current state of scored motif matches
#'
#' @description
#'   can be called at any time
#'
#' @rdname getMultiScoreTable
#'
#' @param obj a TrenaMultiScore object
#'
#' @return a data.frame
#'
#' @export
#'
setMethod('getMultiScoreTable', 'TrenaMultiScore',

    function(obj){

      tbl.fimo <- obj@state$fimo
      if("score" %in% colnames(tbl.fimo))
          colnames(tbl.fimo)[grep("score", colnames(tbl.fimo))] <- "fimo_score"
      if("p.value" %in% colnames(tbl.fimo))
          colnames(tbl.fimo)[grep("p.value", colnames(tbl.fimo))] <- "fimo_pvalue"

      if("motif_id" %in% colnames(tbl.fimo)){ # can be quite long, move it to the end
         current.index <- grep("motif_id", colnames(tbl.fimo))
         colnames.in.preferred.order <- c(colnames(tbl.fimo)[-current.index], "motif_id")
         tbl.fimo <- tbl.fimo[,colnames.in.preferred.order]
         } # motif_id column

      invisible(unique(tbl.fimo))

      }) # getMultiScoreTable

#------------------------------------------------------------------------------------------------------------------------
#' append 'TSS' column to the accumulating table
#'
#' @description
#'   append 'TSS' column to the accumulating table, recording distance from the main
#'   transcript's TSS to the start of the motif hit, negative values for upstream locations
#'
#' @rdname addDistanceToTSS
#'
#' @param obj a TrenaMultiScore object
#'
#' @return None
#'
#' @export
#'
setMethod('addDistanceToTSS', 'TrenaMultiScore',

    function(obj){

      coi <- c("tss", "strand", "chrom", "start", "end")
      targetGene.info <- as.list(getTranscriptsTable(getProject(obj))[coi])
      tbl <- getMultiScoreTable(obj)
      tss <- NA_integer_
      if(all(c("start", "end", "strand")  %in% names (targetGene.info))){
          if(length(targetGene.info$start) > 0)
              tss <- (targetGene.info$tss - tbl$start)  * (targetGene.info$strand) * -1
          } # if start,end,strand all found
      tbl$tss <- tss
      obj@state$fimo <- tbl
      }) # addDistanceToTSS

#------------------------------------------------------------------------------------------------------------------------
#' get basic stats on the primary transcript
#'
#' @description
#'   get basic stats on the primary transcript

#' @rdname getTargetGeneInfo
#'
#' @param obj a TrenaMultiScore object
#'
#' @return None
#'
#' @export
#'
setMethod('getTargetGeneInfo', 'TrenaMultiScore',

    function(obj){

      coi <- c("geneSymbol", "tss", "strand", "chrom", "start", "end", "ensg")
      as.list(getTranscriptsTable(getProject(obj))[coi])
      }) # getTargetGeneInfo

#------------------------------------------------------------------------------------------------------------------------
#' attach a genehancer score to each motif hit, 0 if none
#'
#' @description
#'   adds several conservation scores, each an average, to each motif hit in the already build fimo table
#'
#' @rdname scoreMotifHitsForGeneHancer
#'
#' @param obj a TrenaMultiScore object
#' @return a data.frame
#'
#' @export
#'
setMethod('scoreMotifHitsForGeneHancer', 'TrenaMultiScore',

    function(obj){

      if(nrow(obj@state$genehancer) == 0)
         getGeneHancerRegion(obj)
      tbl.gh <- obj@state$genehancer
      if(nrow(tbl.gh) > 0){
         if(!grepl("chr", tbl.gh$chrom[1]))
             tbl.gh$chrom <- paste0("chr", tbl.gh$chrom)
         } # if tbl.gh
      tbl.fimo <- obj@state$fimo
      if(nrow(tbl.fimo) == 0)
         stop("TrenaMultiScore::scoreMotifHitsForGeneHancer error: no fimo hits yet identified.")

      tbl.fimo$gh <- rep(0, nrow(tbl.fimo))
      if(nrow(tbl.gh) > 0){
         tbl.ov <- as.data.frame(findOverlaps(GRanges(tbl.fimo), GRanges(tbl.gh)))
         colnames(tbl.ov) <- c("fimo", "gh")
         tbl.fimo$gh[tbl.ov$fimo] <- round(tbl.gh$combinedscore[tbl.ov$gh], digits=2)
         }

      rownames(tbl.fimo) <- NULL
      obj@state$fimo <- tbl.fimo
      }) # scoreMotifHitsForGeneHancer

#------------------------------------------------------------------------------------------------------------------------
#' add a tf/targetGene correlations score for those TFs also in the expression matrix
#'
#' @description
#' add a tf/targetGene correlations score for those TFs also in the expression matrix
#'
#' @rdname addGeneExpressionCorrelations
#'
#' @param obj a TrenaMultiScore object
#' @param mtx a numerical matrix, genes are rownames, samples are colnames
#' @param featureName a character string, default "cor", data.frame column where results are stored
#' @param colnames list of character strings, limits scope of correlation calcuation. default empty list: use all columns
#' @param method a character string, either "spearman" or "pearson"
#' @return None
#'
#' @export
#'
setMethod('addGeneExpressionCorrelations', 'TrenaMultiScore',

    function(obj, mtx, featureName="cor", colnames=list(), method="pearson"){

      stopifnot(method %in% c("spearman", "pearson"))

      if(!obj@state$quiet){
         printf("TMS::addGeneExpressionCorrelations, '%s'", featureName)
         }

      if(length(colnames) > 0){
         stopifnot(all(colnames %in% colnames(mtx)))
         mtx <- mtx[, colnames]
         }

      tbl.fimo <- obj@state$fimo
      if(nrow(tbl.fimo) == 0)
         stop("TrenaMultiScore::addGeneExpressionCorrelations error: no fimo hits yet identified.")

      f <- function(tf, targetGene){
         if(tf %in% rownames(mtx))
           return(cor(mtx[targetGene,], mtx[tf,], method=method, use="pairwise.complete.obs"))
         else return(NA)
         }

      if(!obj@state$quiet){
         printf("about to run spearman cor on %d tfs from tbl.fimo", length(tbl.fimo$tf))
         }

      suppressWarnings({
          tfs <- sort(unique(tbl.fimo$tf))
          if(!obj@state$quiet)
             printf("starting creation of cor.map for %d tfs", length(tfs))
          targetGene <- obj@targetGene
             # TMS uses genehancer and TrenaProjectHG38, with gene symbol C2orf49-DT"
             # but GTEx eQTLs and expression matrices use "RP11-332H14.2"
             # here is nasty hack to very temporarily work around this
             # todo: correct incoming GTEx data to use C2orf49-DT
          cor.map <- unlist(lapply(tfs, function(tf) f(tf, targetGene)))
          if(!obj@state$quiet)
             printf("created cor.map for %d tfs", length(tfs))
          names(cor.map) <- tfs
          if(!obj@state$quiet)
             printf("starting use of cor.map for %d tfs", length(tfs))
          cor <- unlist(lapply(tbl.fimo$tf, function(tf) cor.map[[tf]]))
          if(!obj@state$quiet)
             printf("used cor.map for %d tfs", length(tfs))
          })


      cor <- round(cor, digits=2)
      tbl.fimo[, featureName] <- cor
      obj@state$fimo <- tbl.fimo
      }) # addGeneExpressionCorrelations

#------------------------------------------------------------------------------------------------------------------------
#' add intron, exon, utr, promoter, etc annotations
#'
#' @description
#' add intron, exon, utr, promoter, etc annotations
#'
#' @rdname addGenicAnnotations
#'
#' @param obj a TrenaMultiScore object
#' @param mtx a numerical matrix, genes are rownames, samples are colnames
#' @return None
#'
#' @export
#'
setMethod('addGenicAnnotations', 'TrenaMultiScore',

    function(obj){

      tbl.fimo <- obj@state$fimo
      if(nrow(tbl.fimo) == 0)
         stop("TrenaMultiScore::addGenicAnnotations error: no fimo hits yet identified.")

      existing.colnames <- colnames(tbl.fimo)
      gr <- GRanges(tbl.fimo)
        # anno <- build_annotations(genome="hg38", annotations=c("hg38_basicgenes", "hg38_genes_intergenic"))
      anno <- get(load(system.file(package="TrenaMultiScore", "extdata", "genomeAnnotations.RData")))
      gr.annoResults <- annotate_regions(regions=gr, annotations=anno, ignore.strand=TRUE, quiet=FALSE)
      tbl.anno <- as.data.frame(gr.annoResults, row.names=NULL)
      coi <- c(existing.colnames, "annot.symbol", "annot.type")
      coi[1] <- "seqnames"  # bioc-speak for chromosome
      na.symbols <- which(is.na(tbl.anno$annot.symbol))
      if(length(na.symbols) > 0)
          tbl.anno$annot.symbol[na.symbols] <- "NA"
      tbl.aggregated <- aggregate(cbind(annot.type, annot.symbol) ~ seqnames+start+end+strand+tf,
                                  data=tbl.anno, function(x) paste(unique(x), collapse=","))
      tbl.aggregated$annot.type <- gsub("hg38_genes_", "", tbl.aggregated$annot.type)
      colnames(tbl.aggregated)[1] <- "chrom"
      tbl.aggregated$chrom <- as.character(tbl.aggregated$chrom)

      tbl.fimo <- merge(tbl.fimo, tbl.aggregated)
      obj@state$fimo <- tbl.fimo
      }) # addGenicAnnotations

#------------------------------------------------------------------------------------------------------------------------
#' add 1 if a motif hit intersects a ChIP-seq hit, otherwise 0.
#'
#' @description
#' add 1 if a motif hit intersects a ChIP-seq hit, otherwise 0.  full-with ChIP peaks are used.
#'
#' @rdname addChIP
#'
#' @param obj a TrenaMultiScore object
#' @param mtx a numerical matrix, genes are rownames, samples are colnames
#' @return None
#'
#' @export
#'
setMethod('addChIP', 'TrenaMultiScore',

    function(obj){

      tbl.fimo <- obj@state$fimo
      if(nrow(tbl.fimo) == 0)
         stop("TrenaMultiScore::addChIP error: no fimo hits yet identified.")

      has.ChIP <- rep(FALSE, nrow(tbl.fimo))
      chrom <- tbl.fimo$chrom[1]
      start <- min(tbl.fimo$start) - 1000
      end <- max(tbl.fimo$end) + 1000
      tbl.chip <- getChipSeq(getProject(obj), chrom, start, end)
      tbl.chip <- subset(tbl.chip, tf %in% tbl.fimo$tf)
      tbl.ov <- as.data.frame(findOverlaps(GRanges(tbl.fimo), GRanges(tbl.chip), type="any"))
      tbl.combined <- cbind(tbl.fimo[tbl.ov$queryHits,], tbl.chip[tbl.ov$subjectHits,])
      colnames(tbl.combined)[max(grep("tf", colnames(tbl.combined)))] <- "tf.chip"
      tbl.fimoWithChip <- subset(tbl.combined, tf==tf.chip)
          # some chip hits are reported in multiple cell lines, which we don't care about yet
          # remove them
      tbl.fimoWithChip <- tbl.fimoWithChip[-which(duplicated(tbl.fimoWithChip[, 1:4])),]
      rownames(tbl.fimoWithChip) <- NULL
      sig.chip <- with(tbl.fimoWithChip, sprintf("%s:%d-%d.%s", chrom, start, end, tf))
      sig.fimo <- with(tbl.fimo, sprintf("%s:%d-%d.%s", chrom, start, end, tf))
      chip.hits <- match(sig.chip, sig.fimo)
      if(length(chip.hits) > 0)
         has.ChIP[chip.hits] <- TRUE

      tbl.fimo$chip <- has.ChIP
      obj@state$fimo <- tbl.fimo
      }) # addChIP

#------------------------------------------------------------------------------------------------------------------------
#' query remote khaleesi (ingested POSTAR2) database for rna binding protein binding sites for the target gene
#'
#' @description
#' POSTAR2 curated many clip-seq (and related) data sources; we put them in a postgres database, queried here by gene
#'
#' @rdname addRnaBindingProteins
#'
#' @param obj a TrenaMultiScore object
#' @return data.frame
#'
#' @export
#'
setMethod('addRnaBindingProteins', 'TrenaMultiScore',

    function(obj){
      if(!obj@state$quiet)
        message(sprintf("querying rbp database for %s", obj@targetGene))
      geneRegDB <- dbConnect(PostgreSQL(), user= "trena", password="trena",
                             dbname="genereg2021", host="khaleesi")
      query <- sprintf("select * from rbp where target='%s'", obj@targetGene)
      obj@state$tbl.rbp <- dbGetQuery(geneRegDB, query)
      dbDisconnect(geneRegDB)
      if(!obj@state$quiet)
        message(sprintf("rbp binding sites found: %d", nrow(obj@state$tbl.rbp)))
      invisible(obj@state$tbl.rbp)
      }) # addRnaBindingProteins

#------------------------------------------------------------------------------------------------------------------------
#' return previously queried data.frame of rna binding protein binding sites for the target gene
#'
#' @description
#' POSTAR2 curated many clip-seq (and related) data sources; current value is returned here
#'
#' @rdname getRnaBindingProteins
#'
#' @param obj a TrenaMultiScore object
#' @return data.frame
#'
#' @export
#'
setMethod('getRnaBindingProteins', 'TrenaMultiScore',

    function(obj){
      invisible(obj@state$tbl.rbp)
      }) # getRnaBindingProteins

#------------------------------------------------------------------------------------------------------------------------
#' @description
#' add eQTL annotations to all appropriate regions
#'
#' @rdname addRnaBindingProteins
#'
#' @param obj a TrenaMultiScore object
#' @param tbl.eqtls data.frame, with chrom, start, end, and tissue category columns
#' @param pval numeric only consider eQTLs with pval at or below this threshold
#' @param abs.beta numeric only consider eQTLs with abs(beta) at or above this threshold
#' @return nothing
#'
#' @export
#'
setMethod('add.eQTLs', 'TrenaMultiScore',

    function(obj, tbl.eqtls, pval, abs.beta, eqtl.title){
        tbl.fimo <- obj@state$fimo
        required.cols <- c("gene", "pvalue", "beta", "chrom", "hg38", "rsid")
        stopifnot(all(required.cols %in% colnames(tbl.eqtls)))
        tbl.eqtls.sub <- subset(tbl.eqtls, gene==obj@targetGene & pvalue <= pval & abs(beta) >= abs.beta)
        if(!grepl("chr", tbl.eqtls.sub$chrom[1]))
            tbl.eqtls.sub$chrom <- paste0("chr", tbl.eqtls.sub$chrom)
        tbl.ov <- as.data.frame(findOverlaps(GRanges(tbl.fimo),
                                             GRanges(seqnames=tbl.eqtls.sub$chrom, IRanges(tbl.eqtls.sub$hg38))))
        colnames(tbl.ov) <- c("fimo", "eqtl")
        if(nrow(tbl.ov) > 0){
           tbl.info <- as.data.frame(table(tbl.ov$fimo), stringsAsFactors=FALSE)
           colnames(tbl.info) <- c("fimoIndex", "count")
           tbl.info$fimoIndex <- as.integer(tbl.info$fimoIndex)
             #tbl.info$rsid <- tbl.eqtls.sub$rsid[tbl.info$eqtlIndex]
           eqtlCount <- rep(0, nrow(tbl.fimo))
           eqtlCount[tbl.info$fimoIndex] <- tbl.info$count
             #eqtlVariant <- rep(NA, nrow(tbl.fimo))
             #eqtlVariant[tbl.info$fimoIndex] <- tbl.info$rsid
           tbl.fimo[, eqtl.title] <- eqtlCount
             #tbl.fimo[, sprintf("%s.rsid", eqtl.title)] <- eqtlVariant
           obj@state$fimo <- tbl.fimo
           } # if some overlaps
        }) # add.eQTLs

#------------------------------------------------------------------------------------------------------------------------
PriceLab/TrenaMultiScore documentation built on Aug. 22, 2022, 8:01 a.m.