TrenaProjectBrainCell: Temporal cortex gene expression deconvoluted into cell types

library(RUnit)
library(TrenaProjectBrainCell)

printf("--- reading config.R")

stopifnot(packageVersion("TrenaProject") >= "0.99.37")
#stopifnot(packageVersion("TrenaProjectBrainCell") >= "0.99.04")

trenaProject <- TrenaProjectBrainCell()

# I added some minor parcing of the expression matrix to get rid of genes with no expression.
#matrix.name <- "Micro_TYROBP"
#matrix.name <- "Astro_FGFR3"
#matrix.name <- "OPC_PDGFRA"
#matrix.name <- "Inh_ALL"
matrix.name <- "TCX_counts_scaled"
stopifnot(matrix.name %in% getExpressionMatrixNames(trenaProject))
mtx <- getExpressionMatrix(trenaProject, matrix.name)
#mtx.df <- as.data.frame(mtx)
#mtx.df$mean <- rowMeans(mtx.df)
#mtx.df <- mtx.df[(mtx.df$mean > 0.05),]
#mtx.df <- mtx.df[!(mtx.df$mean == 0),]
#mtx.df$mean <- NULL
#mtx <- as.matrix(mtx.df)

tbl.geneHancer <- get(load(system.file(package="TrenaProject", "extdata", "genomeAnnotation", "geneHancer.v4.7.allGenes.RData")))

# there might be a bettter solution, but for now, I'm going to make sure there's only one gene symbol per gene here
# the reason is that the determineRegulatoryRegions function doesn't currently handle things well when there's more than one entry for tbl.geneInfo
tbl.geneInfo <- get(load(system.file(package="TrenaProject", "extdata", "geneInfoTable_hg38.RData")))
tbl.geneInfo <- tbl.geneInfo[!duplicated(tbl.geneInfo$geneSymbol),]

OUTPUTDIR <- "/tmp/MODELS.cory.brain.bulk.scaled"

if(!file.exists(OUTPUTDIR))
   dir.create(OUTPUTDIR)

WORKERS <- 20
SOLVERS <- c("lasso", "lassopv", "pearson", "randomForest", "ridge", "spearman") # , "sqrtLasso")

LOGDIR <- file.path(OUTPUTDIR, "logs")

trenaProject <- TrenaProjectBrainCell()


desired.footprint.databases <- getFootprintDatabaseNames(trenaProject)

#------------------------------------------------------------------------------------------------------------------------
# we want one gene for which we have genehancer regions, and one without.
# we also want these genes to exhibit high variance across samples in the expression matrix
# these provide good (though not exhaustive) tests of building a gsm:
#   - different method for
pickGuineaPigGenes <- function(mtx)
{
   load(system.file(package="TrenaProject", "extdata", "genomeAnnotation", "geneHancer.v4.7.allGenes.RData"))
   tbl.geneHancer <- tbl.enhancers
   dim(tbl.geneHancer)

   load(system.file(package="TrenaProject", "extdata", "geneInfoTable_hg38.RData"))
   geneSymbols.in.mtx <- rownames(mtx)
   printf("geneSymbols all chromosomes: %d", length(geneSymbols.in.mtx))

   geneSymbols.with.enhancers <- intersect(geneSymbols.in.mtx, unique(tbl.geneHancer$geneSymbol))
   printf("genes with geneHancer annotation: %d/%d", length(geneSymbols.with.enhancers), length(geneSymbols.in.mtx))
   geneSymbols.without.enhancers <- setdiff(geneSymbols.in.mtx, unique(tbl.geneHancer$geneSymbol))  # 3174

     # choose one maximally variable gene from each set (with and w/o enhancers)
   x <- as.numeric(lapply(geneSymbols.without.enhancers,  function(gene) sd(mtx[gene,])))
   big.enough.sd <- fivenum(x)[4]
   genes.with.big.enough.sd <- rownames(mtx)[which(x > big.enough.sd)]
   candidates <- intersect(genes.with.big.enough.sd, geneSymbols.without.enhancers)   # 63
      # now make sure they are protein-coding genes
   also.protein.coding <- intersect(candidates, subset(tbl.geneInfo, type=="protein_coding")$geneSymbol) # 8
      # choose INKA2
   guineaPigGenes.without.enhancers <- "INKA2"

   x <- lapply(geneSymbols.with.enhancers,  function(gene) sd(mtx[gene,]))
   biggest.sd <- which(as.numeric(x) == max(as.numeric(x)))
   guineaPigGene.with.enhancers <- geneSymbols.with.enhancers[biggest.sd]        # GSTM1

   return(list(gene.with.enhancers=guineaPigGene.with.enhancers,
               gene.without.enhancers=guineaPigGenes.without.enhancers))

} # pickGuineaPigGenes
#------------------------------------------------------------------------------------------------------------------------
test_pickGuineaPigGenes <- function()
{
   printf("--- test_pickGuineaPigGenes")
   gpg <- pickGuineaPigGenes(mtx)
   gene.no <- gpg$gene.without.enhancers
   gene.yes <- gpg$gene.with.enhancers

      # check to see that with/without genehancer info is accurate
   checkTrue(gene.yes %in% tbl.geneHancer$geneSymbol)
   checkTrue(!gene.no %in% tbl.geneHancer$geneSymbol)

   checkTrue(sd(mtx[gene.yes,]) > 2)
   checkTrue(sd(mtx[gene.no,]) > 0.5)

} # test_pickGuineaPigGenes
#------------------------------------------------------------------------------------------------------------------------
# cory's AD method:
# if there are no enhancers entry: use tss +/- 5kb
# if there are enhancers, use those enhancers and +/- 2kb
#
determineRegulatoryRegions <- function(gene)
{
    tbl.concise <- tbl.geneInfo[which(tbl.geneInfo$geneSymbol == gene), c("chrom", "tss")]
      # no need to figure strand since we go 2500bp in both directions

   if(gene %in% tbl.geneHancer$geneSymbol){
      setTargetGene(trenaProject, gene)
      tbl.enhancers <- getEnhancers(trenaProject)[, c("chrom", "start", "end")]
      shoulder <- 2000
      tbl.promoter <- data.frame(chrom=tbl.concise$chrom,
                                 start=tbl.concise$tss - shoulder,
                                 end=tbl.concise$tss + shoulder,
                                 stringsAsFactors=FALSE)
      tbl.regions <- rbind(tbl.promoter, tbl.enhancers)
      }

   if(!gene %in% tbl.geneHancer$geneSymbol){
      shoulder <- 5000
      tbl.regions <- data.frame(chrom=tbl.concise$chrom,
                                 start=tbl.concise$tss - shoulder,
                                 end=tbl.concise$tss + shoulder,
                                 stringsAsFactors=FALSE)
      }

   new.order <- order(tbl.regions$start, decreasing=FALSE)
   tbl.regions <- tbl.regions[new.order,]
   rownames(tbl.regions) <- NULL
   tbl.reduced <- as.data.frame(union(GRanges(tbl.regions), GRanges(tbl.regions)))[, c("seqnames", "start", "end")]
   colnames(tbl.reduced) <- c("chrom", "start", "end")
   tbl.reduced$chrom <- as.character(tbl.reduced$chrom)

   return(tbl.reduced)

} # determineRegulatoryRegions
#------------------------------------------------------------------------------------------------------------------------
test_determineRegulatoryRegions <- function()
{
   printf("--- test_determineRegulatoryRegions")

   tbl.gh <- determineRegulatoryRegions("GSTM1")   # has genehancer regions
   checkTrue(nrow(tbl.gh) >= 10)
   tss.gstm1 <- subset(tbl.geneInfo, geneSymbol=="GSTM1")$tss
   gr.10kpromoter <- GRanges(data.frame(chr="chr1", start=tss.gstm1-2000, end=tss.gstm1+2000, stringsAsFactors=FALSE))
   gr.regions <- GRanges(tbl.gh)
   checkEquals(length(findOverlaps(gr.10kpromoter, gr.regions, type="within")), 1)

   tbl.no.gh <- determineRegulatoryRegions("RBMXP2")    # no genehancer regions
   checkEquals(dim(tbl.no.gh), c(1,3))
   with(tbl.no.gh, checkEquals(end - start, 10000))

} # test_determineRegulatoryRegions
#------------------------------------------------------------------------------------------------------------------------
# generate bash script to run genes with bplapply in chunks

max <- nrow(mtx)
starts <- seq(1, max, 20)
ends <- starts + 19
ends[length(ends)] <- max
bashScript <- file("runByChunks20.sh")
lines <- sprintf("Rscript runMany.R %d %d", starts, ends)
writeLines(lines, bashScript)
close(bashScript)
#------------------------------------------------------------------------------------------------------------------------
runTests <- function()
{
   test_pickGuineaPigGenes()
   test_determineRegulatoryRegions()

} # runTests
#------------------------------------------------------------------------------------------------------------------------
#runTests()

#test.goi <- as.character(pickGuineaPigGenes(mtx))

goi <- rownames(mtx)
configurationFileRead <- TRUE
tfPrefilterCorrelation=0.4
correlationThreshold=0.4
tf.pool <- (intersect(trenaSGM::allKnownTFs(identifierType="geneSymbol"), mcols(MotifDb)$geneSymbol))
tf.pool <- intersect(tf.pool, rownames(mtx))
printf("using %d tfs, each with a MotifDb matrix and expression in mtx", length(tf.pool))
use.geneHancer <- TRUE

PriceLab/TrenaProjectBrainCell documentation built on April 4, 2020, 3:56 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

PriceLab/TrenaProjectBrainCell
Temporal cortex gene expression deconvoluted into cell types

inst/gsm/config.R
In PriceLab/TrenaProjectBrainCell: Temporal cortex gene expression deconvoluted into cell types

R Package Documentation

Browse R Packages

We want your feedback!

PriceLab/TrenaProjectBrainCell Temporal cortex gene expression deconvoluted into cell types

inst/gsm/config.R In PriceLab/TrenaProjectBrainCell: Temporal cortex gene expression deconvoluted into cell types

R Package Documentation

Browse R Packages

We want your feedback!

PriceLab/TrenaProjectBrainCell
Temporal cortex gene expression deconvoluted into cell types

inst/gsm/config.R
In PriceLab/TrenaProjectBrainCell: Temporal cortex gene expression deconvoluted into cell types