inst/demos/gsm/config.R

library(RUnit)
library(GenomicRanges)
library(TrenaProjectLymphocyte)
library(MotifDb)

printf("--- reading config.R")

trenaProject <- TrenaProjectLymphocyte()

stopifnot(packageVersion("TrenaProject") >= "1.0.6")
stopifnot(packageVersion("TrenaProjectHG38") >= "0.99.45")
stopifnot(packageVersion("TrenaProjectLymphocyte") >= "0.99.21")

getExpressionMatrixNames(trenaProject)
matrix.name <- "GTEX.wholeBlood.rna-seq-geneSymbols.22330x407"

stopifnot(matrix.name %in% getExpressionMatrixNames(trenaProject))
mtx <- getExpressionMatrix(trenaProject, matrix.name)
some.key.genes <- c("IRF4", "TET2", "LAG3", "PDCD1", "HAVCR2")   # jocelyn has PD1 and TIM3 for the last two
all(some.key.genes %in% rownames(mtx))

tbl.geneHancer <- get(load(system.file(package="TrenaProject", "extdata", "genomeAnnotation", "geneHancer.v4.7.allGenes.RData")))
tbl.geneInfo <- get(load(system.file(package="TrenaProject", "extdata", "geneInfoTable_hg38.RData")))

OUTPUTDIR <- "/local/gsm/TrenaProjectLymphocyte/2019-may-17-GTEX.wholeBlood-geneSymbols-22330x407"

if(!file.exists(OUTPUTDIR))
   dir.create(OUTPUTDIR)

WORKERS <- 20
SOLVERS <- c("lasso", "lassopv", "pearson", "randomForest", "ridge", "spearman", "sqrtLasso")

LOGDIR <- file.path(OUTPUTDIR, "logs")

desired.footprint.databases <- getFootprintDatabaseNames(trenaProject)

#------------------------------------------------------------------------------------------------------------------------
# we want one gene for which we have genehancer regions, and one without.
# we also want these genes to exhibit high variance across samples in the expression matrix
# these provide good (though not exhaustive) tests of building a gsm:
#   - different method for
pickGuineaPigGenes <- function(mtx)
{
   tbl.geneHancer <- tbl.enhancers
   dim(tbl.geneHancer)

   load(system.file(package="TrenaProject", "extdata", "geneInfoTable_hg38.RData"))
   geneSymbols.in.mtx <- rownames(mtx)
   printf("geneSymbols all chromosomes: %d", length(geneSymbols.in.mtx))

   geneSymbols.with.enhancers <- intersect(geneSymbols.in.mtx, unique(tbl.geneHancer$geneSymbol))
   printf("genes with geneHancer annotation: %d/%d", length(geneSymbols.with.enhancers), length(geneSymbols.in.mtx))
   geneSymbols.without.enhancers <- setdiff(geneSymbols.in.mtx, unique(tbl.geneHancer$geneSymbol))  # 3174

     # choose one maximally variable gene from each set (with and w/o enhancers)
   x <- as.numeric(lapply(geneSymbols.without.enhancers,  function(gene) sd(mtx[gene,])))
   big.enough.sd <- fivenum(x)[4]
   genes.with.big.enough.sd <- rownames(mtx)[which(x > big.enough.sd)]
   candidates <- intersect(genes.with.big.enough.sd, geneSymbols.without.enhancers)   # 63
   length(candidates)
      # now make sure they are protein-coding genes
   also.protein.coding <- intersect(candidates, subset(tbl.geneInfo, type=="protein_coding")$geneSymbol) # 8
   length(also.protein.coding)
   # choose INKA2
   guineaPigGenes.without.enhancers <- "INKA2"

   x <- lapply(geneSymbols.with.enhancers,  function(gene) sd(mtx[gene,]))
   biggest.sd <- which(as.numeric(x) == max(as.numeric(x)))
   guineaPigGene.with.enhancers <- geneSymbols.with.enhancers[biggest.sd]        # GSTM1

   return(list(gene.with.enhancers=guineaPigGene.with.enhancers,
               gene.without.enhancers=guineaPigGenes.without.enhancers))

} # pickGuineaPigGenes
#------------------------------------------------------------------------------------------------------------------------
test_pickGuineaPigGenes <- function()
{
   printf("--- test_pickGuineaPigGenes")
   gpg <- pickGuineaPigGenes(mtx)
   gene.no <- gpg$gene.without.enhancers
   gene.yes <- gpg$gene.with.enhancers

      # check to see that with/without genehancer info is accurate
   checkTrue(gene.yes %in% tbl.geneHancer$geneSymbol)
   checkTrue(!gene.no %in% tbl.geneHancer$geneSymbol)

   checkTrue(sd(mtx[gene.yes,]) > 3)
   checkTrue(sd(mtx[gene.no,]) > 0.45)

} # test_pickGuineaPigGenes
#------------------------------------------------------------------------------------------------------------------------
# cory's AD method:
# if there are no enhancers entry: use tss +/- 5kb
# if there are enhancers, use those enhancers and +/- 2kb
#
determineRegulatoryRegions <- function(gene)
{
   tbl.concise <- tbl.geneInfo[grep(gene, tbl.geneInfo$geneSymbol), c("chrom", "tss")]
      # no need to figure strand since we go 2500bp in both directions

   if(gene %in% tbl.geneHancer$geneSymbol){
      setTargetGene(trenaProject, gene)
      tbl.enhancers <- getEnhancers(trenaProject)[, c("chrom", "start", "end")]
      shoulder <- 2000
      tbl.promoter <- data.frame(chrom=tbl.concise$chrom,
                                 start=tbl.concise$tss - shoulder,
                                 end=tbl.concise$tss + shoulder,
                                 stringsAsFactors=FALSE)
      tbl.regions <- rbind(tbl.promoter, tbl.enhancers)
      }

   if(!gene %in% tbl.geneHancer$geneSymbol){
      shoulder <- 5000
      tbl.regions <- data.frame(chrom=tbl.concise$chrom,
                                 start=tbl.concise$tss - shoulder,
                                 end=tbl.concise$tss + shoulder,
                                 stringsAsFactors=FALSE)
      }

   new.order <- order(tbl.regions$start, decreasing=FALSE)
   tbl.regions <- tbl.regions[new.order,]
   rownames(tbl.regions) <- NULL
   tbl.reduced <- as.data.frame(union(GRanges(tbl.regions), GRanges(tbl.regions)))[, c("seqnames", "start", "end")]
   colnames(tbl.reduced) <- c("chrom", "start", "end")
   tbl.reduced$chrom <- as.character(tbl.reduced$chrom)

   return(tbl.reduced)

} # determineRegulatoryRegions
#------------------------------------------------------------------------------------------------------------------------
test_determineRegulatoryRegions <- function()
{
   printf("--- test_determineRegulatoryRegions")

   tbl.gh <- determineRegulatoryRegions("GSTM1")   # has genehancer regions
   checkTrue(nrow(tbl.gh) >= 10)
   tss.gstm1 <- subset(tbl.geneInfo, geneSymbol=="GSTM1")$tss
   gr.10kpromoter <- GRanges(data.frame(chr="chr1", start=tss.gstm1-2000, end=tss.gstm1+2000, stringsAsFactors=FALSE))
   gr.regions <- GRanges(tbl.gh)
   checkEquals(length(findOverlaps(gr.10kpromoter, gr.regions, type="within")), 1)

   tbl.no.gh <- determineRegulatoryRegions("RBMXP2")    # no genehancer regions
   checkEquals(dim(tbl.no.gh), c(1,3))
   with(tbl.no.gh, checkEquals(end - start, 10000))

} # test_determineRegulatoryRegions
#------------------------------------------------------------------------------------------------------------------------
runTests <- function()
{
   test_pickGuineaPigGenes()
   test_determineRegulatoryRegions()

} # runTests
#------------------------------------------------------------------------------------------------------------------------
if(!interactive()){
   runTests()
   test.goi <- as.character(pickGuineaPigGenes(mtx))
   goi <- sort(rownames(mtx))
   printf("--- runMany on goi, length: %d", length(goi))
   configurationFileRead <- TRUE
   tfPrefilterCorrelation=0.1
   correlationThreshold=0.1
   tf.pool <- (intersect(trenaSGM::allKnownTFs(identifierType="geneSymbol"), mcols(MotifDb)$geneSymbol))
   tf.pool <- intersect(tf.pool, rownames(mtx))
   printf("using %d tfs, each with a MotifDb matrix and expression in mtx", length(tf.pool))
   use.geneHancer <- TRUE
   }
PriceLab/TrenaProjectLymphocyte documentation built on Sept. 2, 2019, 8 a.m.