library(RUnit)
library(GenomicRanges)
library(TrenaProjectLymphocyte)
library(MotifDb)
printf("--- reading config.R")
trenaProject <- TrenaProjectLymphocyte()
stopifnot(packageVersion("TrenaProject") >= "1.0.6")
stopifnot(packageVersion("TrenaProjectHG38") >= "0.99.45")
stopifnot(packageVersion("TrenaProjectLymphocyte") >= "0.99.21")
getExpressionMatrixNames(trenaProject)
matrix.name <- "GTEX.wholeBlood.rna-seq-geneSymbols.22330x407"
stopifnot(matrix.name %in% getExpressionMatrixNames(trenaProject))
mtx <- getExpressionMatrix(trenaProject, matrix.name)
some.key.genes <- c("IRF4", "TET2", "LAG3", "PDCD1", "HAVCR2") # jocelyn has PD1 and TIM3 for the last two
all(some.key.genes %in% rownames(mtx))
tbl.geneHancer <- get(load(system.file(package="TrenaProject", "extdata", "genomeAnnotation", "geneHancer.v4.7.allGenes.RData")))
tbl.geneInfo <- get(load(system.file(package="TrenaProject", "extdata", "geneInfoTable_hg38.RData")))
OUTPUTDIR <- "/local/gsm/TrenaProjectLymphocyte/2019-may-17-GTEX.wholeBlood-geneSymbols-22330x407"
if(!file.exists(OUTPUTDIR))
dir.create(OUTPUTDIR)
WORKERS <- 20
SOLVERS <- c("lasso", "lassopv", "pearson", "randomForest", "ridge", "spearman", "sqrtLasso")
LOGDIR <- file.path(OUTPUTDIR, "logs")
desired.footprint.databases <- getFootprintDatabaseNames(trenaProject)
#------------------------------------------------------------------------------------------------------------------------
# we want one gene for which we have genehancer regions, and one without.
# we also want these genes to exhibit high variance across samples in the expression matrix
# these provide good (though not exhaustive) tests of building a gsm:
# - different method for
pickGuineaPigGenes <- function(mtx)
{
tbl.geneHancer <- tbl.enhancers
dim(tbl.geneHancer)
load(system.file(package="TrenaProject", "extdata", "geneInfoTable_hg38.RData"))
geneSymbols.in.mtx <- rownames(mtx)
printf("geneSymbols all chromosomes: %d", length(geneSymbols.in.mtx))
geneSymbols.with.enhancers <- intersect(geneSymbols.in.mtx, unique(tbl.geneHancer$geneSymbol))
printf("genes with geneHancer annotation: %d/%d", length(geneSymbols.with.enhancers), length(geneSymbols.in.mtx))
geneSymbols.without.enhancers <- setdiff(geneSymbols.in.mtx, unique(tbl.geneHancer$geneSymbol)) # 3174
# choose one maximally variable gene from each set (with and w/o enhancers)
x <- as.numeric(lapply(geneSymbols.without.enhancers, function(gene) sd(mtx[gene,])))
big.enough.sd <- fivenum(x)[4]
genes.with.big.enough.sd <- rownames(mtx)[which(x > big.enough.sd)]
candidates <- intersect(genes.with.big.enough.sd, geneSymbols.without.enhancers) # 63
length(candidates)
# now make sure they are protein-coding genes
also.protein.coding <- intersect(candidates, subset(tbl.geneInfo, type=="protein_coding")$geneSymbol) # 8
length(also.protein.coding)
# choose INKA2
guineaPigGenes.without.enhancers <- "INKA2"
x <- lapply(geneSymbols.with.enhancers, function(gene) sd(mtx[gene,]))
biggest.sd <- which(as.numeric(x) == max(as.numeric(x)))
guineaPigGene.with.enhancers <- geneSymbols.with.enhancers[biggest.sd] # GSTM1
return(list(gene.with.enhancers=guineaPigGene.with.enhancers,
gene.without.enhancers=guineaPigGenes.without.enhancers))
} # pickGuineaPigGenes
#------------------------------------------------------------------------------------------------------------------------
test_pickGuineaPigGenes <- function()
{
printf("--- test_pickGuineaPigGenes")
gpg <- pickGuineaPigGenes(mtx)
gene.no <- gpg$gene.without.enhancers
gene.yes <- gpg$gene.with.enhancers
# check to see that with/without genehancer info is accurate
checkTrue(gene.yes %in% tbl.geneHancer$geneSymbol)
checkTrue(!gene.no %in% tbl.geneHancer$geneSymbol)
checkTrue(sd(mtx[gene.yes,]) > 3)
checkTrue(sd(mtx[gene.no,]) > 0.45)
} # test_pickGuineaPigGenes
#------------------------------------------------------------------------------------------------------------------------
# cory's AD method:
# if there are no enhancers entry: use tss +/- 5kb
# if there are enhancers, use those enhancers and +/- 2kb
#
determineRegulatoryRegions <- function(gene)
{
tbl.concise <- tbl.geneInfo[grep(gene, tbl.geneInfo$geneSymbol), c("chrom", "tss")]
# no need to figure strand since we go 2500bp in both directions
if(gene %in% tbl.geneHancer$geneSymbol){
setTargetGene(trenaProject, gene)
tbl.enhancers <- getEnhancers(trenaProject)[, c("chrom", "start", "end")]
shoulder <- 2000
tbl.promoter <- data.frame(chrom=tbl.concise$chrom,
start=tbl.concise$tss - shoulder,
end=tbl.concise$tss + shoulder,
stringsAsFactors=FALSE)
tbl.regions <- rbind(tbl.promoter, tbl.enhancers)
}
if(!gene %in% tbl.geneHancer$geneSymbol){
shoulder <- 5000
tbl.regions <- data.frame(chrom=tbl.concise$chrom,
start=tbl.concise$tss - shoulder,
end=tbl.concise$tss + shoulder,
stringsAsFactors=FALSE)
}
new.order <- order(tbl.regions$start, decreasing=FALSE)
tbl.regions <- tbl.regions[new.order,]
rownames(tbl.regions) <- NULL
tbl.reduced <- as.data.frame(union(GRanges(tbl.regions), GRanges(tbl.regions)))[, c("seqnames", "start", "end")]
colnames(tbl.reduced) <- c("chrom", "start", "end")
tbl.reduced$chrom <- as.character(tbl.reduced$chrom)
return(tbl.reduced)
} # determineRegulatoryRegions
#------------------------------------------------------------------------------------------------------------------------
test_determineRegulatoryRegions <- function()
{
printf("--- test_determineRegulatoryRegions")
tbl.gh <- determineRegulatoryRegions("GSTM1") # has genehancer regions
checkTrue(nrow(tbl.gh) >= 10)
tss.gstm1 <- subset(tbl.geneInfo, geneSymbol=="GSTM1")$tss
gr.10kpromoter <- GRanges(data.frame(chr="chr1", start=tss.gstm1-2000, end=tss.gstm1+2000, stringsAsFactors=FALSE))
gr.regions <- GRanges(tbl.gh)
checkEquals(length(findOverlaps(gr.10kpromoter, gr.regions, type="within")), 1)
tbl.no.gh <- determineRegulatoryRegions("RBMXP2") # no genehancer regions
checkEquals(dim(tbl.no.gh), c(1,3))
with(tbl.no.gh, checkEquals(end - start, 10000))
} # test_determineRegulatoryRegions
#------------------------------------------------------------------------------------------------------------------------
runTests <- function()
{
test_pickGuineaPigGenes()
test_determineRegulatoryRegions()
} # runTests
#------------------------------------------------------------------------------------------------------------------------
if(!interactive()){
runTests()
test.goi <- as.character(pickGuineaPigGenes(mtx))
goi <- sort(rownames(mtx))
printf("--- runMany on goi, length: %d", length(goi))
configurationFileRead <- TRUE
tfPrefilterCorrelation=0.1
correlationThreshold=0.1
tf.pool <- (intersect(trenaSGM::allKnownTFs(identifierType="geneSymbol"), mcols(MotifDb)$geneSymbol))
tf.pool <- intersect(tf.pool, rownames(mtx))
printf("using %d tfs, each with a MotifDb matrix and expression in mtx", length(tf.pool))
use.geneHancer <- TRUE
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.